IB/mthca: Don't allow userspace open while recovering from catastrophic error
author Jack Morgenstein <jackm@dev.mellanox.co.il>
Sun, 6 Sep 2009 03:36:16 +0000 (20:36 -0700)
committer Roland Dreier <rolandd@cisco.com>
Sun, 6 Sep 2009 03:36:16 +0000 (20:36 -0700)
Userspace apps are supposed to release all ib device resources if they
receive a fatal async event (IBV_EVENT_DEVICE_FATAL).  However, the
app has no way of knowing when the device has come back up, except to
repeatedly attempt ibv_open_device() until it succeeds.

However, currently there is no protection against the open succeeding
while the device is being removed following the fatal event.  In
this case, the open will succeed, but as a result the device waits in
the middle of its removal until the new app releases its resources --
and the new app will not do so, since the open succeeded at a point
following the fatal event generation.

This patch adds an "active" flag to the device. The active flag is set
to false (in the fatal event flow) before the "fatal" event is
generated, so any subsequent ibv_open_device() call to the device will
fail until the device comes back up, thus preventing the above
deadlock.

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
drivers/infiniband/hw/mthca/mthca_catas.c
drivers/infiniband/hw/mthca/mthca_dev.h
drivers/infiniband/hw/mthca/mthca_main.c
drivers/infiniband/hw/mthca/mthca_provider.c

index 65ad359fdf164e506ec9a5726ba12c5a5ee882f2..056b2a4c69700f9fdb7cb6640213804e5f850aec 100644 (file)
@@ -88,6 +88,7 @@ static void handle_catas(struct mthca_dev *dev)
        event.device = &dev->ib_dev;
        event.event  = IB_EVENT_DEVICE_FATAL;
        event.element.port_num = 0;
+       dev->active = false;
 
        ib_dispatch_event(&event);
 
index 9ef611f6dd36d52f531198d0a61d7ec11dbe78fd..7e6a6d64ad4eb1bee96b0d2d244daf20898ab3b6 100644 (file)
@@ -357,6 +357,7 @@ struct mthca_dev {
        struct ib_ah         *sm_ah[MTHCA_MAX_PORTS];
        spinlock_t            sm_lock;
        u8                    rate[MTHCA_MAX_PORTS];
+       bool                  active;
 };
 
 #ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
index 13da9f1d24c0a2bf9d715f32701d0cb02406cba2..518cc540e516ca4c3863dc59e47c7b4347809b87 100644 (file)
@@ -1116,6 +1116,8 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type)
        pci_set_drvdata(pdev, mdev);
        mdev->hca_type = hca_type;
 
+       mdev->active = true;
+
        return 0;
 
 err_unregister:
index 87ad889e367b2b6b39cfe010ec1ec28f1dd488ac..bcf7a401482015f3b5c09afdc4177bab3d10577f 100644 (file)
@@ -334,6 +334,9 @@ static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev,
        struct mthca_ucontext           *context;
        int                              err;
 
+       if (!(to_mdev(ibdev)->active))
+               return ERR_PTR(-EAGAIN);
+
        memset(&uresp, 0, sizeof uresp);
 
        uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps;