IB/ipoib: Get rid of the sysfs_mutex
author: Jason Gunthorpe <jgg@mellanox.com>
Sun, 29 Jul 2018 08:34:57 +0000 (11:34 +0300)
committer: Jason Gunthorpe <jgg@mellanox.com>
Fri, 3 Aug 2018 02:27:43 +0000 (20:27 -0600)
This mutex was introduced to deal with the deadlock formed by calling
unregister_netdev from within the sysfs callback of a netdev.

Now that we have priv_destructor and needs_free_netdev we can switch
to the more targeted solution of running the unregister from a
work queue. This avoids the deadlock and gets rid of the mutex.

The next patch in the series needs this mutex eliminated to create
atomicity of unregistration.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
drivers/infiniband/ulp/ipoib/ipoib.h
drivers/infiniband/ulp/ipoib/ipoib_cm.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/ipoib/ipoib_vlan.c

index d2cb0a8500e3c2feff8a943ea681565080763ffd..804cb4bee57d981f98c2c008f8ec3e7a4a75b036 100644 (file)
@@ -332,7 +332,6 @@ struct ipoib_dev_priv {
 
        struct rw_semaphore vlan_rwsem;
        struct mutex mcast_mutex;
-       struct mutex sysfs_mutex;
 
        struct rb_root  path_tree;
        struct list_head path_list;
index 16ea08dc59a8d93eac137bb8b30abf39c1358eaf..ea01b8dd2be606206193408ca7bf9c42e4ea4e39 100644 (file)
@@ -1517,19 +1517,13 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr,
 {
        struct net_device *dev = to_net_dev(d);
        int ret;
-       struct ipoib_dev_priv *priv = ipoib_priv(dev);
-
-       if (!mutex_trylock(&priv->sysfs_mutex))
-               return restart_syscall();
 
        if (!rtnl_trylock()) {
-               mutex_unlock(&priv->sysfs_mutex);
                return restart_syscall();
        }
 
        if (dev->reg_state != NETREG_REGISTERED) {
                rtnl_unlock();
-               mutex_unlock(&priv->sysfs_mutex);
                return -EPERM;
        }
 
@@ -1541,7 +1535,6 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr,
         */
        if (ret != -EBUSY)
                rtnl_unlock();
-       mutex_unlock(&priv->sysfs_mutex);
 
        return (!ret || ret == -EBUSY) ? count : ret;
 }
index 73d917d57f93456115756da837adb7086e725f99..e9f4f261fe206dc10c77ef991256c611bd88164e 100644 (file)
@@ -2079,7 +2079,6 @@ static void ipoib_build_priv(struct net_device *dev)
        spin_lock_init(&priv->lock);
        init_rwsem(&priv->vlan_rwsem);
        mutex_init(&priv->mcast_mutex);
-       mutex_init(&priv->sysfs_mutex);
 
        INIT_LIST_HEAD(&priv->path_list);
        INIT_LIST_HEAD(&priv->child_intfs);
@@ -2476,10 +2475,7 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data)
        list_for_each_entry_safe(priv, tmp, dev_list, list) {
                ipoib_parent_unregister_pre(priv->dev);
 
-               /* Wrap rtnl_lock/unlock with mutex to protect sysfs calls */
-               mutex_lock(&priv->sysfs_mutex);
                unregister_netdev(priv->dev);
-               mutex_unlock(&priv->sysfs_mutex);
        }
 
        kfree(dev_list);
@@ -2527,8 +2523,7 @@ static int __init ipoib_init_module(void)
         * its private workqueue, and we only queue up flush events
         * on our global flush workqueue.  This avoids the deadlocks.
         */
-       ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush",
-                                                 WQ_MEM_RECLAIM);
+       ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", 0);
        if (!ipoib_workqueue) {
                ret = -ENOMEM;
                goto err_fs;
index 7776334cf8c59ff2a7eca9543a2e3eec67e2f2b2..891c5b40018af3f13cac44f11e9486229a41d587 100644 (file)
@@ -125,23 +125,16 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
        snprintf(intf_name, sizeof(intf_name), "%s.%04x",
                 ppriv->dev->name, pkey);
 
-       if (!mutex_trylock(&ppriv->sysfs_mutex))
+       if (!rtnl_trylock())
                return restart_syscall();
 
-       if (!rtnl_trylock()) {
-               mutex_unlock(&ppriv->sysfs_mutex);
-               return restart_syscall();
-       }
-
        if (pdev->reg_state != NETREG_REGISTERED) {
                rtnl_unlock();
-               mutex_unlock(&ppriv->sysfs_mutex);
                return -EPERM;
        }
 
        if (!down_write_trylock(&ppriv->vlan_rwsem)) {
                rtnl_unlock();
-               mutex_unlock(&ppriv->sysfs_mutex);
                return restart_syscall();
        }
 
@@ -178,58 +171,95 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
 out:
        up_write(&ppriv->vlan_rwsem);
        rtnl_unlock();
-       mutex_unlock(&ppriv->sysfs_mutex);
 
        return result;
 }
 
+struct ipoib_vlan_delete_work {
+       struct work_struct work;
+       struct net_device *dev;
+};
+
+/*
+ * sysfs callbacks of a netdevice cannot obtain the rtnl lock as
+ * unregister_netdev ultimately deletes the sysfs files while holding the rtnl
+ * lock. This deadlocks the system.
+ *
+ * A callback can use rtnl_trylock to avoid the deadlock but it cannot call
+ * unregister_netdev as that internally takes and releases the rtnl_lock.  So
+ * instead we find the netdev to unregister and then do the actual unregister
+ * from the global work queue where we can obtain the rtnl_lock safely.
+ */
+static void ipoib_vlan_delete_task(struct work_struct *work)
+{
+       struct ipoib_vlan_delete_work *pwork =
+               container_of(work, struct ipoib_vlan_delete_work, work);
+       struct net_device *dev = pwork->dev;
+
+       rtnl_lock();
+
+       /* Unregistering tasks can race with another task or parent removal */
+       if (dev->reg_state == NETREG_REGISTERED) {
+               struct ipoib_dev_priv *priv = ipoib_priv(dev);
+               struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
+
+               down_write(&ppriv->vlan_rwsem);
+               list_del(&priv->list);
+               up_write(&ppriv->vlan_rwsem);
+
+               ipoib_dbg(ppriv, "delete child vlan %s\n", dev->name);
+               unregister_netdevice(dev);
+       }
+
+       rtnl_unlock();
+
+       kfree(pwork);
+}
+
 int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey)
 {
        struct ipoib_dev_priv *ppriv, *priv, *tpriv;
-       struct net_device *dev = NULL;
+       int rc;
 
        if (!capable(CAP_NET_ADMIN))
                return -EPERM;
 
-       ppriv = ipoib_priv(pdev);
-
-       if (!mutex_trylock(&ppriv->sysfs_mutex))
+       if (!rtnl_trylock())
                return restart_syscall();
 
-       if (!rtnl_trylock()) {
-               mutex_unlock(&ppriv->sysfs_mutex);
-               return restart_syscall();
-       }
-
        if (pdev->reg_state != NETREG_REGISTERED) {
                rtnl_unlock();
-               mutex_unlock(&ppriv->sysfs_mutex);
                return -EPERM;
        }
 
-       if (!down_write_trylock(&ppriv->vlan_rwsem)) {
-               rtnl_unlock();
-               mutex_unlock(&ppriv->sysfs_mutex);
-               return restart_syscall();
-       }
+       ppriv = ipoib_priv(pdev);
 
+       rc = -ENODEV;
        list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) {
                if (priv->pkey == pkey &&
                    priv->child_type == IPOIB_LEGACY_CHILD) {
-                       list_del(&priv->list);
-                       dev = priv->dev;
+                       struct ipoib_vlan_delete_work *work;
+
+                       work = kmalloc(sizeof(*work), GFP_KERNEL);
+                       if (!work) {
+                               rc = -ENOMEM;
+                               goto out;
+                       }
+
+                       down_write(&ppriv->vlan_rwsem);
+                       list_del_init(&priv->list);
+                       up_write(&ppriv->vlan_rwsem);
+                       work->dev = priv->dev;
+                       INIT_WORK(&work->work, ipoib_vlan_delete_task);
+                       queue_work(ipoib_workqueue, &work->work);
+
+                       rc = 0;
                        break;
                }
        }
-       up_write(&ppriv->vlan_rwsem);
-
-       if (dev) {
-               ipoib_dbg(ppriv, "delete child vlan %s\n", dev->name);
-               unregister_netdevice(dev);
-       }
 
+out:
        rtnl_unlock();
-       mutex_unlock(&ppriv->sysfs_mutex);
 
-       return (dev) ? 0 : -ENODEV;
+       return rc;
 }