scsi: qla2xxx: Fix crash in PCIe error handling
authorQuinn Tran <qutran@marvell.com>
Mon, 29 Mar 2021 08:52:25 +0000 (01:52 -0700)
committerMartin K. Petersen <martin.petersen@oracle.com>
Tue, 30 Mar 2021 02:38:58 +0000 (22:38 -0400)
BUG: unable to handle kernel NULL pointer dereference at (null)
IP: qla2x00_abort_isp+0x21/0x6b0 [qla2xxx] PGD 0 P4D 0
Oops: 0000 [#1] SMP PTI
CPU: 0 PID: 1715 Comm: kworker/0:2
Tainted: GOE 4.12.14-122.37-default #1 SLE12-SP5
Hardware name: HPE Superdome Flex/Superdome Flex, BIOS
Bundle:3.30.100 SFW:IP147.007.004.017.000.2009211957 09/21/2020
Workqueue: events aer_recover_work_func
task: ffff9e399c14ca80 task.stack: ffffc1c58e4ac000
RIP: 0010:qla2x00_abort_isp+0x21/0x6b0 [qla2xxx]
RSP: 0018:ffffc1c58e4afd50 EFLAGS: 00010282
RAX: 0000000000000000 RBX: ffff9e419cdef480 RCX: 0000000000000000
RDX: ffff9e399c14ca80 RSI: 0000000000000246 RDI: ffff9e419bbc27b8
RBP: ffff9e419bbc27b8 R08: 0000000000000004 R09: 00000000a0440000
R10: 0000000000000000 R11: ffff9e399416d1a0 R12: ffff9e419cdef000
R13: ffff9e3a7cfae800 R14: ffff9e3a7cfae800 R15: 00000000000000c0
FS:  0000000000000000(0000) GS:ffff9e39a0000000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 00000006cd00a005 CR4: 00000000007606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
  qla2xxx_pci_slot_reset+0x141/0x160 [qla2xxx]
  report_slot_reset+0x41/0x80
  ? merge_result.part.4+0x30/0x30
  pci_walk_bus+0x70/0x90
  pcie_do_recovery+0x1db/0x2e0
  aer_recover_work_func+0xc2/0xf0
  process_one_work+0x14c/0x390

Disable board_disable logic where driver resources are freed while OS is in
the process of recovering the adapter.

Link: https://lore.kernel.org/r/20210329085229.4367-9-njavali@marvell.com
Tested-by: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Quinn Tran <qutran@marvell.com>
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
drivers/scsi/qla2xxx/qla_dbg.c
drivers/scsi/qla2xxx/qla_def.h
drivers/scsi/qla2xxx/qla_gbl.h
drivers/scsi/qla2xxx/qla_init.c
drivers/scsi/qla2xxx/qla_inline.h
drivers/scsi/qla2xxx/qla_iocb.c
drivers/scsi/qla2xxx/qla_isr.c
drivers/scsi/qla2xxx/qla_mbx.c
drivers/scsi/qla2xxx/qla_nvme.c
drivers/scsi/qla2xxx/qla_os.c

index 144a893e7335b2782fb163437af3a8d3ca8d05fc..f2d05592c1e2f96f0e22b0d3f8bbf413126a4156 100644 (file)
@@ -113,8 +113,13 @@ qla27xx_dump_mpi_ram(struct qla_hw_data *ha, uint32_t addr, uint32_t *ram,
        uint32_t stat;
        ulong i, j, timer = 6000000;
        int rval = QLA_FUNCTION_FAILED;
+       scsi_qla_host_t *vha = pci_get_drvdata(ha->pdev);
 
        clear_bit(MBX_INTERRUPT, &ha->mbx_cmd_flags);
+
+       if (qla_pci_disconnected(vha, reg))
+               return rval;
+
        for (i = 0; i < ram_dwords; i += dwords, addr += dwords) {
                if (i + dwords > ram_dwords)
                        dwords = ram_dwords - i;
@@ -138,6 +143,9 @@ qla27xx_dump_mpi_ram(struct qla_hw_data *ha, uint32_t addr, uint32_t *ram,
                while (timer--) {
                        udelay(5);
 
+                       if (qla_pci_disconnected(vha, reg))
+                               return rval;
+
                        stat = rd_reg_dword(&reg->host_status);
                        /* Check for pending interrupts. */
                        if (!(stat & HSRX_RISC_INT))
@@ -192,9 +200,13 @@ qla24xx_dump_ram(struct qla_hw_data *ha, uint32_t addr, __be32 *ram,
        uint32_t dwords = qla2x00_gid_list_size(ha) / 4;
        uint32_t stat;
        ulong i, j, timer = 6000000;
+       scsi_qla_host_t *vha = pci_get_drvdata(ha->pdev);
 
        clear_bit(MBX_INTERRUPT, &ha->mbx_cmd_flags);
 
+       if (qla_pci_disconnected(vha, reg))
+               return rval;
+
        for (i = 0; i < ram_dwords; i += dwords, addr += dwords) {
                if (i + dwords > ram_dwords)
                        dwords = ram_dwords - i;
@@ -216,8 +228,10 @@ qla24xx_dump_ram(struct qla_hw_data *ha, uint32_t addr, __be32 *ram,
                ha->flags.mbox_int = 0;
                while (timer--) {
                        udelay(5);
-                       stat = rd_reg_dword(&reg->host_status);
+                       if (qla_pci_disconnected(vha, reg))
+                               return rval;
 
+                       stat = rd_reg_dword(&reg->host_status);
                        /* Check for pending interrupts. */
                        if (!(stat & HSRX_RISC_INT))
                                continue;
index ca7cebae1d786ae2200a7d9e87dbc06307d67340..def4d99f80e99a60a87e4db136a4ba9c036b93c2 100644 (file)
@@ -396,6 +396,7 @@ typedef union {
        } b;
 } port_id_t;
 #define INVALID_PORT_ID        0xFFFFFF
+#define ISP_REG16_DISCONNECT 0xFFFF
 
 static inline le_id_t be_id_to_le(be_id_t id)
 {
@@ -3857,6 +3858,13 @@ struct qla_hw_data_stat {
        u32 num_mpi_reset;
 };
 
+/* refer to pcie_do_recovery reference */
+typedef enum {
+       QLA_PCI_RESUME,
+       QLA_PCI_ERR_DETECTED,
+       QLA_PCI_MMIO_ENABLED,
+       QLA_PCI_SLOT_RESET,
+} pci_error_state_t;
 /*
  * Qlogic host adapter specific data structure.
 */
@@ -4607,6 +4615,7 @@ struct qla_hw_data {
 #define DEFAULT_ZIO_THRESHOLD 5
 
        struct qla_hw_data_stat stat;
+       pci_error_state_t pci_error_state;
 };
 
 struct active_regions {
@@ -4727,6 +4736,7 @@ typedef struct scsi_qla_host {
 #define FX00_CRITEMP_RECOVERY  25
 #define FX00_HOST_INFO_RESEND  26
 #define QPAIR_ONLINE_CHECK_NEEDED      27
+#define DO_EEH_RECOVERY                28
 #define DETECT_SFP_CHANGE      29
 #define N2N_LOGIN_NEEDED       30
 #define IOCB_WORK_ACTIVE       31
index 6486f97d649e209cad5672a6ff8eccc9bbab4332..fae5cae6f0a8da823018bb642a10b45cf175fb39 100644 (file)
@@ -224,6 +224,7 @@ extern int qla2x00_post_uevent_work(struct scsi_qla_host *, u32);
 
 extern int qla2x00_post_uevent_work(struct scsi_qla_host *, u32);
 extern void qla2x00_disable_board_on_pci_error(struct work_struct *);
+extern void qla_eeh_work(struct work_struct *);
 extern void qla2x00_sp_compl(srb_t *sp, int);
 extern void qla2xxx_qpair_sp_free_dma(srb_t *sp);
 extern void qla2xxx_qpair_sp_compl(srb_t *sp, int);
@@ -235,6 +236,8 @@ int qla24xx_post_relogin_work(struct scsi_qla_host *vha);
 void qla2x00_wait_for_sess_deletion(scsi_qla_host_t *);
 void qla24xx_process_purex_rdp(struct scsi_qla_host *vha,
                               struct purex_item *pkt);
+void qla_pci_set_eeh_busy(struct scsi_qla_host *);
+void qla_schedule_eeh_work(struct scsi_qla_host *);
 
 /*
  * Global Functions in qla_mid.c source file.
index 19681d3c5b7a545d0442fee8c1600b8880bde987..9c5782e946e00ef6fb608ef0086d04236e2d8ef6 100644 (file)
@@ -6932,22 +6932,18 @@ qla2x00_abort_isp_cleanup(scsi_qla_host_t *vha)
        }
        spin_unlock_irqrestore(&ha->vport_slock, flags);
 
-       if (!ha->flags.eeh_busy) {
-               /* Make sure for ISP 82XX IO DMA is complete */
-               if (IS_P3P_TYPE(ha)) {
-                       qla82xx_chip_reset_cleanup(vha);
-                       ql_log(ql_log_info, vha, 0x00b4,
-                           "Done chip reset cleanup.\n");
-
-                       /* Done waiting for pending commands.
-                        * Reset the online flag.
-                        */
-                       vha->flags.online = 0;
-               }
+       /* Make sure for ISP 82XX IO DMA is complete */
+       if (IS_P3P_TYPE(ha)) {
+               qla82xx_chip_reset_cleanup(vha);
+               ql_log(ql_log_info, vha, 0x00b4,
+                      "Done chip reset cleanup.\n");
 
-               /* Requeue all commands in outstanding command list. */
-               qla2x00_abort_all_cmds(vha, DID_RESET << 16);
+               /* Done waiting for pending commands. Reset online flag */
+               vha->flags.online = 0;
        }
+
+       /* Requeue all commands in outstanding command list. */
+       qla2x00_abort_all_cmds(vha, DID_RESET << 16);
        /* memory barrier */
        wmb();
 }
@@ -6978,6 +6974,12 @@ qla2x00_abort_isp(scsi_qla_host_t *vha)
                if (vha->hw->flags.port_isolated)
                        return status;
 
+               if (qla2x00_isp_reg_stat(ha)) {
+                       ql_log(ql_log_info, vha, 0x803f,
+                              "ISP Abort - ISP reg disconnect, exiting.\n");
+                       return status;
+               }
+
                if (test_and_clear_bit(ISP_ABORT_TO_ROM, &vha->dpc_flags)) {
                        ha->flags.chip_reset_done = 1;
                        vha->flags.online = 1;
@@ -7017,8 +7019,18 @@ qla2x00_abort_isp(scsi_qla_host_t *vha)
 
                ha->isp_ops->get_flash_version(vha, req->ring);
 
+               if (qla2x00_isp_reg_stat(ha)) {
+                       ql_log(ql_log_info, vha, 0x803f,
+                              "ISP Abort - ISP reg disconnect pre nvram config, exiting.\n");
+                       return status;
+               }
                ha->isp_ops->nvram_config(vha);
 
+               if (qla2x00_isp_reg_stat(ha)) {
+                       ql_log(ql_log_info, vha, 0x803f,
+                              "ISP Abort - ISP reg disconnect post nvmram config, exiting.\n");
+                       return status;
+               }
                if (!qla2x00_restart_isp(vha)) {
                        clear_bit(RESET_MARKER_NEEDED, &vha->dpc_flags);
 
index e80e41b6c9e1deeacd5df30cd5927586fef7a73b..82937c6bd9c4941f5600217924c05a6b1f03efd6 100644 (file)
@@ -432,3 +432,49 @@ qla_put_iocbs(struct qla_qpair *qp, struct iocb_resource *iores)
        }
        iores->res_type = RESOURCE_NONE;
 }
+
+#define ISP_REG_DISCONNECT 0xffffffffU
+/**************************************************************************
+ * qla2x00_isp_reg_stat
+ *
+ * Description:
+ *        Read the host status register of ISP before aborting the command.
+ *
+ * Input:
+ *       ha = pointer to host adapter structure.
+ *
+ *
+ * Returns:
+ *       Either true or false.
+ *
+ * Note: Return true if there is register disconnect.
+ **************************************************************************/
+static inline
+uint32_t qla2x00_isp_reg_stat(struct qla_hw_data *ha)
+{
+       struct device_reg_24xx __iomem *reg = &ha->iobase->isp24;
+       struct device_reg_82xx __iomem *reg82 = &ha->iobase->isp82;
+
+       if (IS_P3P_TYPE(ha))
+               return ((rd_reg_dword(&reg82->host_int)) == ISP_REG_DISCONNECT);
+       else
+               return ((rd_reg_dword(&reg->host_status)) ==
+                       ISP_REG_DISCONNECT);
+}
+
+static inline
+bool qla_pci_disconnected(struct scsi_qla_host *vha,
+                         struct device_reg_24xx __iomem *reg)
+{
+       uint32_t stat;
+       bool ret = false;
+
+       stat = rd_reg_dword(&reg->host_status);
+       if (stat == 0xffffffff) {
+               ql_log(ql_log_info, vha, 0x8041,
+                      "detected PCI disconnect.\n");
+               qla_schedule_eeh_work(vha);
+               ret = true;
+       }
+       return ret;
+}
index fea3a92e772441bc3d60032f78f26dea69db584c..38b5bdde2405531d7a8b336db502469c6b75a16f 100644 (file)
@@ -1645,8 +1645,14 @@ qla24xx_start_scsi(srb_t *sp)
                goto queuing_error;
 
        if (req->cnt < (req_cnt + 2)) {
-               cnt = IS_SHADOW_REG_CAPABLE(ha) ? *req->out_ptr :
-                   rd_reg_dword_relaxed(req->req_q_out);
+               if (IS_SHADOW_REG_CAPABLE(ha)) {
+                       cnt = *req->out_ptr;
+               } else {
+                       cnt = rd_reg_dword_relaxed(req->req_q_out);
+                       if (qla2x00_check_reg16_for_disconnect(vha, cnt))
+                               goto queuing_error;
+               }
+
                if (req->ring_index < cnt)
                        req->cnt = cnt - req->ring_index;
                else
@@ -1842,8 +1848,13 @@ qla24xx_dif_start_scsi(srb_t *sp)
                goto queuing_error;
 
        if (req->cnt < (req_cnt + 2)) {
-               cnt = IS_SHADOW_REG_CAPABLE(ha) ? *req->out_ptr :
-                   rd_reg_dword_relaxed(req->req_q_out);
+               if (IS_SHADOW_REG_CAPABLE(ha)) {
+                       cnt = *req->out_ptr;
+               } else {
+                       cnt = rd_reg_dword_relaxed(req->req_q_out);
+                       if (qla2x00_check_reg16_for_disconnect(vha, cnt))
+                               goto queuing_error;
+               }
                if (req->ring_index < cnt)
                        req->cnt = cnt - req->ring_index;
                else
@@ -1922,6 +1933,7 @@ queuing_error:
 
        qla_put_iocbs(sp->qpair, &sp->iores);
        spin_unlock_irqrestore(&ha->hardware_lock, flags);
+
        return QLA_FUNCTION_FAILED;
 }
 
@@ -1991,8 +2003,14 @@ qla2xxx_start_scsi_mq(srb_t *sp)
                goto queuing_error;
 
        if (req->cnt < (req_cnt + 2)) {
-               cnt = IS_SHADOW_REG_CAPABLE(ha) ? *req->out_ptr :
-                   rd_reg_dword_relaxed(req->req_q_out);
+               if (IS_SHADOW_REG_CAPABLE(ha)) {
+                       cnt = *req->out_ptr;
+               } else {
+                       cnt = rd_reg_dword_relaxed(req->req_q_out);
+                       if (qla2x00_check_reg16_for_disconnect(vha, cnt))
+                               goto queuing_error;
+               }
+
                if (req->ring_index < cnt)
                        req->cnt = cnt - req->ring_index;
                else
@@ -2203,8 +2221,14 @@ qla2xxx_dif_start_scsi_mq(srb_t *sp)
                goto queuing_error;
 
        if (req->cnt < (req_cnt + 2)) {
-               cnt = IS_SHADOW_REG_CAPABLE(ha) ? *req->out_ptr :
-                   rd_reg_dword_relaxed(req->req_q_out);
+               if (IS_SHADOW_REG_CAPABLE(ha)) {
+                       cnt = *req->out_ptr;
+               } else {
+                       cnt = rd_reg_dword_relaxed(req->req_q_out);
+                       if (qla2x00_check_reg16_for_disconnect(vha, cnt))
+                               goto queuing_error;
+               }
+
                if (req->ring_index < cnt)
                        req->cnt = cnt - req->ring_index;
                else
@@ -2281,6 +2305,7 @@ queuing_error:
 
        qla_put_iocbs(sp->qpair, &sp->iores);
        spin_unlock_irqrestore(&qpair->qp_lock, flags);
+
        return QLA_FUNCTION_FAILED;
 }
 
@@ -2325,6 +2350,11 @@ __qla2x00_alloc_iocbs(struct qla_qpair *qpair, srb_t *sp)
                        cnt = qla2x00_debounce_register(
                            ISP_REQ_Q_OUT(ha, &reg->isp));
 
+               if (!qpair->use_shadow_reg && cnt == ISP_REG16_DISCONNECT) {
+                       qla_schedule_eeh_work(vha);
+                       return NULL;
+               }
+
                if  (req->ring_index < cnt)
                        req->cnt = cnt - req->ring_index;
                else
@@ -3740,6 +3770,9 @@ qla2x00_start_sp(srb_t *sp)
        void *pkt;
        unsigned long flags;
 
+       if (vha->hw->flags.eeh_busy)
+               return -EIO;
+
        spin_lock_irqsave(qp->qp_lock_ptr, flags);
        pkt = __qla2x00_alloc_iocbs(sp->qpair, sp);
        if (!pkt) {
@@ -3957,8 +3990,14 @@ qla2x00_start_bidir(srb_t *sp, struct scsi_qla_host *vha, uint32_t tot_dsds)
 
        /* Check for room on request queue. */
        if (req->cnt < req_cnt + 2) {
-               cnt = IS_SHADOW_REG_CAPABLE(ha) ? *req->out_ptr :
-                   rd_reg_dword_relaxed(req->req_q_out);
+               if (IS_SHADOW_REG_CAPABLE(ha)) {
+                       cnt = *req->out_ptr;
+               } else {
+                       cnt = rd_reg_dword_relaxed(req->req_q_out);
+                       if (qla2x00_check_reg16_for_disconnect(vha, cnt))
+                               goto queuing_error;
+               }
+
                if  (req->ring_index < cnt)
                        req->cnt = cnt - req->ring_index;
                else
@@ -3997,5 +4036,6 @@ qla2x00_start_bidir(srb_t *sp, struct scsi_qla_host *vha, uint32_t tot_dsds)
        qla2x00_start_iocbs(vha, req);
 queuing_error:
        spin_unlock_irqrestore(&ha->hardware_lock, flags);
+
        return rval;
 }
index 0938119e8a5f87ebaaddfa82ce3bf83ab120db14..11d6e0db07fedc659f0899650f91765b91668d5c 100644 (file)
@@ -270,12 +270,7 @@ qla2x00_check_reg32_for_disconnect(scsi_qla_host_t *vha, uint32_t reg)
                if (!test_and_set_bit(PFLG_DISCONNECTED, &vha->pci_flags) &&
                    !test_bit(PFLG_DRIVER_REMOVING, &vha->pci_flags) &&
                    !test_bit(PFLG_DRIVER_PROBING, &vha->pci_flags)) {
-                       /*
-                        * Schedule this (only once) on the default system
-                        * workqueue so that all the adapter workqueues and the
-                        * DPC thread can be shutdown cleanly.
-                        */
-                       schedule_work(&vha->hw->board_disable);
+                       qla_schedule_eeh_work(vha);
                }
                return true;
        } else
@@ -1657,8 +1652,6 @@ global_port_update:
        case MBA_TEMPERATURE_ALERT:
                ql_dbg(ql_dbg_async, vha, 0x505e,
                    "TEMPERATURE ALERT: %04x %04x %04x\n", mb[1], mb[2], mb[3]);
-               if (mb[1] == 0x12)
-                       schedule_work(&ha->board_disable);
                break;
 
        case MBA_TRANS_INSERT:
index 06c99963b2c97d14cb5d49bd7ecf34d8d72d5b16..0149f84cdd8e342850c2bbcbb1dea95fe56dc15e 100644 (file)
@@ -161,7 +161,8 @@ qla2x00_mailbox_command(scsi_qla_host_t *vha, mbx_cmd_t *mcp)
        /* check if ISP abort is active and return cmd with timeout */
        if ((test_bit(ABORT_ISP_ACTIVE, &base_vha->dpc_flags) ||
            test_bit(ISP_ABORT_RETRY, &base_vha->dpc_flags) ||
-           test_bit(ISP_ABORT_NEEDED, &base_vha->dpc_flags)) &&
+           test_bit(ISP_ABORT_NEEDED, &base_vha->dpc_flags) ||
+           ha->flags.eeh_busy) &&
            !is_rom_cmd(mcp->mb[0])) {
                ql_log(ql_log_info, vha, 0x1005,
                    "Cmd 0x%x aborted with timeout since ISP Abort is pending\n",
index 0237588f48b04c4d0a14b89798cbfce378f7b28b..0cacb667a88b9ad7f20942ad5aee9c3c1e49ae18 100644 (file)
@@ -398,8 +398,13 @@ static inline int qla2x00_start_nvme_mq(srb_t *sp)
        }
        req_cnt = qla24xx_calc_iocbs(vha, tot_dsds);
        if (req->cnt < (req_cnt + 2)) {
-               cnt = IS_SHADOW_REG_CAPABLE(ha) ? *req->out_ptr :
-                   rd_reg_dword_relaxed(req->req_q_out);
+               if (IS_SHADOW_REG_CAPABLE(ha)) {
+                       cnt = *req->out_ptr;
+               } else {
+                       cnt = rd_reg_dword_relaxed(req->req_q_out);
+                       if (qla2x00_check_reg16_for_disconnect(vha, cnt))
+                               goto queuing_error;
+               }
 
                if (req->ring_index < cnt)
                        req->cnt = cnt - req->ring_index;
@@ -536,6 +541,7 @@ static inline int qla2x00_start_nvme_mq(srb_t *sp)
 
 queuing_error:
        spin_unlock_irqrestore(&qpair->qp_lock, flags);
+
        return rval;
 }
 
index cb4155f8fd44a3fee733df07c1b11acb241d808d..12959e3874cb3ddc8eb06399dbc433999c8f4d68 100644 (file)
@@ -971,6 +971,13 @@ qla2xxx_mqueuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd,
                goto qc24_fail_command;
        }
 
+       if (!qpair->online) {
+               ql_dbg(ql_dbg_io, vha, 0x3077,
+                      "qpair not online. eeh_busy=%d.\n", ha->flags.eeh_busy);
+               cmd->result = DID_NO_CONNECT << 16;
+               goto qc24_fail_command;
+       }
+
        if (!fcport || fcport->deleted) {
                cmd->result = DID_IMM_RETRY << 16;
                goto qc24_fail_command;
@@ -1200,35 +1207,6 @@ qla2x00_wait_for_chip_reset(scsi_qla_host_t *vha)
        return return_status;
 }
 
-#define ISP_REG_DISCONNECT 0xffffffffU
-/**************************************************************************
-* qla2x00_isp_reg_stat
-*
-* Description:
-*      Read the host status register of ISP before aborting the command.
-*
-* Input:
-*      ha = pointer to host adapter structure.
-*
-*
-* Returns:
-*      Either true or false.
-*
-* Note:        Return true if there is register disconnect.
-**************************************************************************/
-static inline
-uint32_t qla2x00_isp_reg_stat(struct qla_hw_data *ha)
-{
-       struct device_reg_24xx __iomem *reg = &ha->iobase->isp24;
-       struct device_reg_82xx __iomem *reg82 = &ha->iobase->isp82;
-
-       if (IS_P3P_TYPE(ha))
-               return ((rd_reg_dword(&reg82->host_int)) == ISP_REG_DISCONNECT);
-       else
-               return ((rd_reg_dword(&reg->host_status)) ==
-                       ISP_REG_DISCONNECT);
-}
-
 /**************************************************************************
 * qla2xxx_eh_abort
 *
@@ -1262,6 +1240,7 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
        if (qla2x00_isp_reg_stat(ha)) {
                ql_log(ql_log_info, vha, 0x8042,
                    "PCI/Register disconnect, exiting.\n");
+               qla_pci_set_eeh_busy(vha);
                return FAILED;
        }
 
@@ -1455,6 +1434,7 @@ qla2xxx_eh_device_reset(struct scsi_cmnd *cmd)
        if (qla2x00_isp_reg_stat(ha)) {
                ql_log(ql_log_info, vha, 0x803e,
                    "PCI/Register disconnect, exiting.\n");
+               qla_pci_set_eeh_busy(vha);
                return FAILED;
        }
 
@@ -1471,6 +1451,7 @@ qla2xxx_eh_target_reset(struct scsi_cmnd *cmd)
        if (qla2x00_isp_reg_stat(ha)) {
                ql_log(ql_log_info, vha, 0x803f,
                    "PCI/Register disconnect, exiting.\n");
+               qla_pci_set_eeh_busy(vha);
                return FAILED;
        }
 
@@ -1506,6 +1487,7 @@ qla2xxx_eh_bus_reset(struct scsi_cmnd *cmd)
        if (qla2x00_isp_reg_stat(ha)) {
                ql_log(ql_log_info, vha, 0x8040,
                    "PCI/Register disconnect, exiting.\n");
+               qla_pci_set_eeh_busy(vha);
                return FAILED;
        }
 
@@ -1583,7 +1565,7 @@ qla2xxx_eh_host_reset(struct scsi_cmnd *cmd)
        if (qla2x00_isp_reg_stat(ha)) {
                ql_log(ql_log_info, vha, 0x8041,
                    "PCI/Register disconnect, exiting.\n");
-               schedule_work(&ha->board_disable);
+               qla_pci_set_eeh_busy(vha);
                return SUCCESS;
        }
 
@@ -6668,6 +6650,9 @@ qla2x00_do_dpc(void *data)
 
                schedule();
 
+               if (test_and_clear_bit(DO_EEH_RECOVERY, &base_vha->dpc_flags))
+                       qla_pci_set_eeh_busy(base_vha);
+
                if (!base_vha->flags.init_done || ha->flags.mbox_busy)
                        goto end_loop;
 
@@ -7384,6 +7369,8 @@ static void qla_pci_error_cleanup(scsi_qla_host_t *vha)
        int i;
        unsigned long flags;
 
+       ql_dbg(ql_dbg_aer, vha, 0x9000,
+              "%s\n", __func__);
        ha->chip_reset++;
 
        ha->base_qpair->chip_reset = ha->chip_reset;
@@ -7393,28 +7380,16 @@ static void qla_pci_error_cleanup(scsi_qla_host_t *vha)
                            ha->base_qpair->chip_reset;
        }
 
-       /* purge MBox commands */
-       if (atomic_read(&ha->num_pend_mbx_stage3)) {
-               clear_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags);
-               complete(&ha->mbx_intr_comp);
-       }
-
-       i = 0;
-
-       while (atomic_read(&ha->num_pend_mbx_stage3) ||
-           atomic_read(&ha->num_pend_mbx_stage2) ||
-           atomic_read(&ha->num_pend_mbx_stage1)) {
-               msleep(20);
-               i++;
-               if (i > 50)
-                       break;
-       }
-
-       ha->flags.purge_mbox = 0;
+       /*
+        * purge mailbox might take a while. Slot Reset/chip reset
+        * will take care of the purge
+        */
 
        mutex_lock(&ha->mq_lock);
+       ha->base_qpair->online = 0;
        list_for_each_entry(qpair, &base_vha->qp_list, qp_list_elem)
                qpair->online = 0;
+       wmb();
        mutex_unlock(&ha->mq_lock);
 
        qla2x00_mark_all_devices_lost(vha);
@@ -7451,14 +7426,17 @@ qla2xxx_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
 {
        scsi_qla_host_t *vha = pci_get_drvdata(pdev);
        struct qla_hw_data *ha = vha->hw;
+       pci_ers_result_t ret = PCI_ERS_RESULT_NEED_RESET;
 
-       ql_dbg(ql_dbg_aer, vha, 0x9000,
-           "PCI error detected, state %x.\n", state);
+       ql_log(ql_log_warn, vha, 0x9000,
+              "PCI error detected, state %x.\n", state);
+       ha->pci_error_state = QLA_PCI_ERR_DETECTED;
 
        if (!atomic_read(&pdev->enable_cnt)) {
                ql_log(ql_log_info, vha, 0xffff,
                        "PCI device is disabled,state %x\n", state);
-               return PCI_ERS_RESULT_NEED_RESET;
+               ret = PCI_ERS_RESULT_NEED_RESET;
+               goto out;
        }
 
        switch (state) {
@@ -7468,11 +7446,12 @@ qla2xxx_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
                        set_bit(QPAIR_ONLINE_CHECK_NEEDED, &vha->dpc_flags);
                        qla2xxx_wake_dpc(vha);
                }
-               return PCI_ERS_RESULT_CAN_RECOVER;
+               ret = PCI_ERS_RESULT_CAN_RECOVER;
+               break;
        case pci_channel_io_frozen:
-               ha->flags.eeh_busy = 1;
-               qla_pci_error_cleanup(vha);
-               return PCI_ERS_RESULT_NEED_RESET;
+               qla_pci_set_eeh_busy(vha);
+               ret = PCI_ERS_RESULT_NEED_RESET;
+               break;
        case pci_channel_io_perm_failure:
                ha->flags.pci_channel_io_perm_failure = 1;
                qla2x00_abort_all_cmds(vha, DID_NO_CONNECT << 16);
@@ -7480,9 +7459,12 @@ qla2xxx_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
                        set_bit(QPAIR_ONLINE_CHECK_NEEDED, &vha->dpc_flags);
                        qla2xxx_wake_dpc(vha);
                }
-               return PCI_ERS_RESULT_DISCONNECT;
+               ret = PCI_ERS_RESULT_DISCONNECT;
        }
-       return PCI_ERS_RESULT_NEED_RESET;
+out:
+       ql_dbg(ql_dbg_aer, vha, 0x600d,
+              "PCI error detected returning [%x].\n", ret);
+       return ret;
 }
 
 static pci_ers_result_t
@@ -7496,6 +7478,10 @@ qla2xxx_pci_mmio_enabled(struct pci_dev *pdev)
        struct device_reg_2xxx __iomem *reg = &ha->iobase->isp;
        struct device_reg_24xx __iomem *reg24 = &ha->iobase->isp24;
 
+       ql_log(ql_log_warn, base_vha, 0x9000,
+              "mmio enabled\n");
+
+       ha->pci_error_state = QLA_PCI_MMIO_ENABLED;
        if (IS_QLA82XX(ha))
                return PCI_ERS_RESULT_RECOVERED;
 
@@ -7519,10 +7505,11 @@ qla2xxx_pci_mmio_enabled(struct pci_dev *pdev)
                ql_log(ql_log_info, base_vha, 0x9003,
                    "RISC paused -- mmio_enabled, Dumping firmware.\n");
                qla2xxx_dump_fw(base_vha);
-
-               return PCI_ERS_RESULT_NEED_RESET;
-       } else
-               return PCI_ERS_RESULT_RECOVERED;
+       }
+       /* set PCI_ERS_RESULT_NEED_RESET to trigger call to qla2xxx_pci_slot_reset */
+       ql_dbg(ql_dbg_aer, base_vha, 0x600d,
+              "mmio enabled returning.\n");
+       return PCI_ERS_RESULT_NEED_RESET;
 }
 
 static pci_ers_result_t
@@ -7534,9 +7521,10 @@ qla2xxx_pci_slot_reset(struct pci_dev *pdev)
        int rc;
        struct qla_qpair *qpair = NULL;
 
-       ql_dbg(ql_dbg_aer, base_vha, 0x9004,
-           "Slot Reset.\n");
+       ql_log(ql_log_warn, base_vha, 0x9004,
+              "Slot Reset.\n");
 
+       ha->pci_error_state = QLA_PCI_SLOT_RESET;
        /* Workaround: qla2xxx driver which access hardware earlier
         * needs error state to be pci_channel_io_online.
         * Otherwise mailbox command timesout.
@@ -7570,16 +7558,24 @@ qla2xxx_pci_slot_reset(struct pci_dev *pdev)
                qpair->online = 1;
        mutex_unlock(&ha->mq_lock);
 
+       ha->flags.eeh_busy = 0;
        base_vha->flags.online = 1;
        set_bit(ABORT_ISP_ACTIVE, &base_vha->dpc_flags);
-       if (ha->isp_ops->abort_isp(base_vha) == QLA_SUCCESS)
-               ret =  PCI_ERS_RESULT_RECOVERED;
+       ha->isp_ops->abort_isp(base_vha);
        clear_bit(ABORT_ISP_ACTIVE, &base_vha->dpc_flags);
 
+       if (qla2x00_isp_reg_stat(ha)) {
+               ha->flags.eeh_busy = 1;
+               qla_pci_error_cleanup(base_vha);
+               ql_log(ql_log_warn, base_vha, 0x9005,
+                      "Device unable to recover from PCI error.\n");
+       } else {
+               ret =  PCI_ERS_RESULT_RECOVERED;
+       }
 
 exit_slot_reset:
        ql_dbg(ql_dbg_aer, base_vha, 0x900e,
-           "slot_reset return %x.\n", ret);
+           "Slot Reset returning %x.\n", ret);
 
        return ret;
 }
@@ -7591,16 +7587,55 @@ qla2xxx_pci_resume(struct pci_dev *pdev)
        struct qla_hw_data *ha = base_vha->hw;
        int ret;
 
-       ql_dbg(ql_dbg_aer, base_vha, 0x900f,
-           "pci_resume.\n");
+       ql_log(ql_log_warn, base_vha, 0x900f,
+              "Pci Resume.\n");
 
-       ha->flags.eeh_busy = 0;
 
        ret = qla2x00_wait_for_hba_online(base_vha);
        if (ret != QLA_SUCCESS) {
                ql_log(ql_log_fatal, base_vha, 0x9002,
                    "The device failed to resume I/O from slot/link_reset.\n");
        }
+       ha->pci_error_state = QLA_PCI_RESUME;
+       ql_dbg(ql_dbg_aer, base_vha, 0x600d,
+              "Pci Resume returning.\n");
+}
+
+void qla_pci_set_eeh_busy(struct scsi_qla_host *vha)
+{
+       struct qla_hw_data *ha = vha->hw;
+       struct scsi_qla_host *base_vha = pci_get_drvdata(ha->pdev);
+       bool do_cleanup = false;
+       unsigned long flags;
+
+       if (ha->flags.eeh_busy)
+               return;
+
+       spin_lock_irqsave(&base_vha->work_lock, flags);
+       if (!ha->flags.eeh_busy) {
+               ha->flags.eeh_busy = 1;
+               do_cleanup = true;
+       }
+       spin_unlock_irqrestore(&base_vha->work_lock, flags);
+
+       if (do_cleanup)
+               qla_pci_error_cleanup(base_vha);
+}
+
+/*
+ * this routine will schedule a task to pause IO from interrupt context
+ * if caller sees a PCIE error event (register read = 0xf's)
+ */
+void qla_schedule_eeh_work(struct scsi_qla_host *vha)
+{
+       struct qla_hw_data *ha = vha->hw;
+       struct scsi_qla_host *base_vha = pci_get_drvdata(ha->pdev);
+
+       if (ha->flags.eeh_busy)
+               return;
+
+       set_bit(DO_EEH_RECOVERY, &base_vha->dpc_flags);
+       qla2xxx_wake_dpc(base_vha);
 }
 
 static void