IB/IPoIB: Enqueue separate work_structs for each flushed interface
author Cosmin Ratiu <cratiu@nvidia.com>
Wed, 21 May 2025 12:08:58 +0000 (15:08 +0300)
committer Jakub Kicinski <kuba@kernel.org>
Thu, 22 May 2025 16:15:04 +0000 (09:15 -0700)
Previously, flushing a netdevice involved flushing all of its child
devices from within the parent's flush task. That required holding the
lock protecting the child list for the entire duration of the flush.

This poses a problem when converting from vlan_rwsem to the netdev
instance lock (next patch): acquiring a child lock while holding the
parent lock rightfully makes lockdep unhappy.

Fix this by splitting the single big flush task into individual
per-device flush tasks (which already exist in their respective
ipoib_dev_priv structs) and adding a helper function that enqueues all
of them while holding the list lock.
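
For reference, this is the new helper added to ipoib_ib.c (see the hunk
below), with explanatory comments added; it enqueues the children's
work_structs before the parent's own, mirroring the child-first order
of the old recursive flush:

  void ipoib_queue_work(struct ipoib_dev_priv *priv,
                        enum ipoib_flush_level level)
  {
          /* Queue the children's flush work first -- they might be up
           * even if the parent is down.
           */
          if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
                  struct ipoib_dev_priv *cpriv;

                  down_read(&priv->vlan_rwsem);
                  list_for_each_entry(cpriv, &priv->child_intfs, list)
                          ipoib_queue_work(cpriv, level);
                  up_read(&priv->vlan_rwsem);
          }

          /* ...then the device's own work item for the requested level. */
          switch (level) {
          case IPOIB_FLUSH_LIGHT:
                  queue_work(ipoib_workqueue, &priv->flush_light);
                  break;
          case IPOIB_FLUSH_NORMAL:
                  queue_work(ipoib_workqueue, &priv->flush_normal);
                  break;
          case IPOIB_FLUSH_HEAVY:
                  queue_work(ipoib_workqueue, &priv->flush_heavy);
                  break;
          }
  }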

In ipoib_set_mac, the helper is not used and the work is enqueued
directly, because subsequent patches change the locking and this
function may then be called with the netdev instance lock held.

This is effectively a no-op: the workqueue is single-threaded and
ordered, so it executes the same flush operations in the same order as
before.

Furthermore, there should be no new races: ipoib_parent_unregister_pre()
stops new work generation and then calls flush_workqueue(), which waits
for all currently enqueued work to finish before returning.
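
As an illustration only, a minimal sketch of that shutdown ordering;
only the flush_workqueue() call and the intent to stop new work
generation are taken from the existing ipoib_parent_unregister_pre(),
the rest is elided:

  static void ipoib_parent_unregister_pre(struct net_device *ndev)
  {
          struct ipoib_dev_priv *priv = ipoib_priv(ndev);

          /* ... stop new work generation, so no further flush work can
           * be queued past this point ...
           */

          /* Wait for every already-enqueued flush work_struct, including
           * the per-child ones introduced by this patch, to finish.
           */
          flush_workqueue(ipoib_workqueue);
  }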

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1747829342-1018757-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
drivers/infiniband/ulp/ipoib/ipoib.h
drivers/infiniband/ulp/ipoib/ipoib_ib.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/ipoib/ipoib_verbs.c

index abe0522b7df46a0c5cd6a2c9bcd2715b7628d819..2e05e9c9317dd4a4fd0be675103a9a8dfbe32bb2 100644 (file)
@@ -512,6 +512,8 @@ int ipoib_intf_init(struct ib_device *hca, u32 port, const char *format,
 void ipoib_ib_dev_flush_light(struct work_struct *work);
 void ipoib_ib_dev_flush_normal(struct work_struct *work);
 void ipoib_ib_dev_flush_heavy(struct work_struct *work);
+void ipoib_queue_work(struct ipoib_dev_priv *priv,
+                     enum ipoib_flush_level level);
 void ipoib_ib_tx_timeout_work(struct work_struct *work);
 void ipoib_ib_dev_cleanup(struct net_device *dev);
 
index 5cde275daa9416c4feae01bf7b92344779c13ed8..e0e7f600097db8941f6ea52d0c7f0620ff96d00d 100644 (file)
@@ -1172,24 +1172,11 @@ out:
 }
 
 static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
-                               enum ipoib_flush_level level,
-                               int nesting)
+                               enum ipoib_flush_level level)
 {
-       struct ipoib_dev_priv *cpriv;
        struct net_device *dev = priv->dev;
        int result;
 
-       down_read_nested(&priv->vlan_rwsem, nesting);
-
-       /*
-        * Flush any child interfaces too -- they might be up even if
-        * the parent is down.
-        */
-       list_for_each_entry(cpriv, &priv->child_intfs, list)
-               __ipoib_ib_dev_flush(cpriv, level, nesting + 1);
-
-       up_read(&priv->vlan_rwsem);
-
        if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) &&
            level != IPOIB_FLUSH_HEAVY) {
                /* Make sure the dev_addr is set even if not flushing */
@@ -1280,7 +1267,7 @@ void ipoib_ib_dev_flush_light(struct work_struct *work)
        struct ipoib_dev_priv *priv =
                container_of(work, struct ipoib_dev_priv, flush_light);
 
-       __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT, 0);
+       __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
 }
 
 void ipoib_ib_dev_flush_normal(struct work_struct *work)
@@ -1288,7 +1275,7 @@ void ipoib_ib_dev_flush_normal(struct work_struct *work)
        struct ipoib_dev_priv *priv =
                container_of(work, struct ipoib_dev_priv, flush_normal);
 
-       __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL, 0);
+       __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
 }
 
 void ipoib_ib_dev_flush_heavy(struct work_struct *work)
@@ -1297,10 +1284,35 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work)
                container_of(work, struct ipoib_dev_priv, flush_heavy);
 
        rtnl_lock();
-       __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY, 0);
+       __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
        rtnl_unlock();
 }
 
+void ipoib_queue_work(struct ipoib_dev_priv *priv,
+                     enum ipoib_flush_level level)
+{
+       if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+               struct ipoib_dev_priv *cpriv;
+
+               down_read(&priv->vlan_rwsem);
+               list_for_each_entry(cpriv, &priv->child_intfs, list)
+                       ipoib_queue_work(cpriv, level);
+               up_read(&priv->vlan_rwsem);
+       }
+
+       switch (level) {
+       case IPOIB_FLUSH_LIGHT:
+               queue_work(ipoib_workqueue, &priv->flush_light);
+               break;
+       case IPOIB_FLUSH_NORMAL:
+               queue_work(ipoib_workqueue, &priv->flush_normal);
+               break;
+       case IPOIB_FLUSH_HEAVY:
+               queue_work(ipoib_workqueue, &priv->flush_heavy);
+               break;
+       }
+}
+
 void ipoib_ib_dev_cleanup(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
index 3b463db8ce39d977ddd2a8e3d5bd648d90354d9a..55b1f3cbee1767161581e2d256aa6111a63e86d1 100644 (file)
@@ -2415,6 +2415,14 @@ static int ipoib_set_mac(struct net_device *dev, void *addr)
 
        set_base_guid(priv, (union ib_gid *)(ss->__data + 4));
 
+       if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+               struct ipoib_dev_priv *cpriv;
+
+               down_read(&priv->vlan_rwsem);
+               list_for_each_entry(cpriv, &priv->child_intfs, list)
+                       queue_work(ipoib_workqueue, &cpriv->flush_light);
+               up_read(&priv->vlan_rwsem);
+       }
        queue_work(ipoib_workqueue, &priv->flush_light);
 
        return 0;
@@ -2526,7 +2534,7 @@ static struct net_device *ipoib_add_port(const char *format,
        ib_register_event_handler(&priv->event_handler);
 
        /* call event handler to ensure pkey in sync */
-       queue_work(ipoib_workqueue, &priv->flush_heavy);
+       ipoib_queue_work(priv, IPOIB_FLUSH_HEAVY);
 
        ndev->rtnl_link_ops = ipoib_get_link_ops();
 
index 368e5d77416de9d1757846ad9cab034561b23fa2..86983080d28bffaf8342ab8616033cb3f408d71b 100644 (file)
@@ -280,15 +280,15 @@ void ipoib_event(struct ib_event_handler *handler,
                  dev_name(&record->device->dev), record->element.port_num);
 
        if (record->event == IB_EVENT_CLIENT_REREGISTER) {
-               queue_work(ipoib_workqueue, &priv->flush_light);
+               ipoib_queue_work(priv, IPOIB_FLUSH_LIGHT);
        } else if (record->event == IB_EVENT_PORT_ERR ||
                   record->event == IB_EVENT_PORT_ACTIVE ||
                   record->event == IB_EVENT_LID_CHANGE) {
-               queue_work(ipoib_workqueue, &priv->flush_normal);
+               ipoib_queue_work(priv, IPOIB_FLUSH_NORMAL);
        } else if (record->event == IB_EVENT_PKEY_CHANGE) {
-               queue_work(ipoib_workqueue, &priv->flush_heavy);
+               ipoib_queue_work(priv, IPOIB_FLUSH_HEAVY);
        } else if (record->event == IB_EVENT_GID_CHANGE &&
                   !test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
-               queue_work(ipoib_workqueue, &priv->flush_light);
+               ipoib_queue_work(priv, IPOIB_FLUSH_LIGHT);
        }
 }