RDMA/mlx5: Support plane device and driver APIs to add and delete it
authorMark Zhang <markzhang@nvidia.com>
Sun, 16 Jun 2024 16:08:39 +0000 (19:08 +0300)
committerLeon Romanovsky <leonro@nvidia.com>
Mon, 1 Jul 2024 12:38:05 +0000 (15:38 +0300)
This patch supports driver APIs "add_sub_dev" and "del_sub_dev", to
add and delete a plane device respectively.
A mlx5 plane device is an rdma SMI device; it provides the SMI capability
through user MAD for its parent, the logical multi-plane aggregated
device. For a plane port:
- It supports QP0 only;
- When adding a plane device, all plane ports are added;
- For some commands like mad_ifc, both plane_index and native portnum
  are needed;
- When querying or modifying a plane port context, the native portnum
  must be used, as the query/modify_hca_vport_context command doesn't
  support plane port.

Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Link: https://lore.kernel.org/r/e933cd0562aece181f8657af2ca0f5b387d0f14e.1718553901.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
drivers/infiniband/hw/mlx5/cmd.c
drivers/infiniband/hw/mlx5/cmd.h
drivers/infiniband/hw/mlx5/mad.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/mlx5/qpc.c

index 1d0c8d5e745bfbf427f81ecefe75732c49ed8e0c..895b62cc528df6e73f64acd46aea34d56e23c608 100644 (file)
@@ -177,7 +177,7 @@ int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid)
        return mlx5_cmd_exec_in(dev, dealloc_xrcd, in);
 }
 
-int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
+int mlx5_cmd_mad_ifc(struct mlx5_ib_dev *dev, const void *inb, void *outb,
                     u16 opmod, u8 port)
 {
        int outlen = MLX5_ST_SZ_BYTES(mad_ifc_out);
@@ -195,12 +195,18 @@ int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
 
        MLX5_SET(mad_ifc_in, in, opcode, MLX5_CMD_OP_MAD_IFC);
        MLX5_SET(mad_ifc_in, in, op_mod, opmod);
-       MLX5_SET(mad_ifc_in, in, port, port);
+       if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) {
+               MLX5_SET(mad_ifc_in, in, plane_index, port);
+               MLX5_SET(mad_ifc_in, in, port,
+                        smi_to_native_portnum(dev, port));
+       } else {
+               MLX5_SET(mad_ifc_in, in, port, port);
+       }
 
        data = MLX5_ADDR_OF(mad_ifc_in, in, mad);
        memcpy(data, inb, MLX5_FLD_SZ_BYTES(mad_ifc_in, mad));
 
-       err = mlx5_cmd_exec_inout(dev, mad_ifc, in, out);
+       err = mlx5_cmd_exec_inout(dev->mdev, mad_ifc, in, out);
        if (err)
                goto out;
 
index 93a971a40d119832b76b9bfa8864af01e5782fdf..e5cd31270443d8f8e7801247d1dad5e8982fd42a 100644 (file)
@@ -54,7 +54,7 @@ int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid,
                        u32 qpn, u16 uid);
 int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid);
 int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid);
-int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
+int mlx5_cmd_mad_ifc(struct mlx5_ib_dev *dev, const void *inb, void *outb,
                     u16 opmod, u8 port);
 int mlx5_cmd_uar_alloc(struct mlx5_core_dev *dev, u32 *uarn, u16 uid);
 int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid);
index 3e43687a7f6f7313f4031955b1ad03fbc22b8bf9..ead836d159d3694777fb71ab41eb4995fa44af84 100644 (file)
@@ -69,7 +69,7 @@ static int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey,
        if (ignore_bkey || !in_wc)
                op_modifier |= 0x2;
 
-       return mlx5_cmd_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier,
+       return mlx5_cmd_mad_ifc(dev, in_mad, response_mad, op_modifier,
                                port);
 }
 
index 4fed8d1ed819a731f0af0de4e56773f157e851da..8a1a43700b45df06c8e0028b6d11a40cf34cf378 100644 (file)
@@ -282,6 +282,14 @@ struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
        struct mlx5_ib_multiport_info *mpi;
        struct mlx5_ib_port *port;
 
+       if (ibdev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) {
+               if (native_port_num)
+                       *native_port_num = smi_to_native_portnum(ibdev,
+                                                                ib_port_num);
+               return ibdev->mdev;
+
+       }
+
        if (!mlx5_core_mp_enabled(ibdev->mdev) ||
            ll != IB_LINK_LAYER_ETHERNET) {
                if (native_port_num)
@@ -1347,6 +1355,9 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port,
 
        /* props being zeroed by the caller, avoid zeroing it here */
 
+       if (ibdev->type == RDMA_DEVICE_TYPE_SMI)
+               port = smi_to_native_portnum(dev, port);
+
        err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
        if (err)
                goto out;
@@ -1362,7 +1373,8 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port,
        if (dev->num_plane) {
                props->port_cap_flags |= IB_PORT_SM_DISABLED;
                props->port_cap_flags &= ~IB_PORT_SM;
-       }
+       } else if (ibdev->type == RDMA_DEVICE_TYPE_SMI)
+               props->port_cap_flags &= ~IB_PORT_CM_SUP;
 
        props->gid_tbl_len      = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
        props->max_msg_sz       = 1 << MLX5_CAP_GEN(mdev, log_max_msg);
@@ -2812,7 +2824,8 @@ static int set_has_smi_cap(struct mlx5_ib_dev *dev)
                if (dev->num_plane) {
                        dev->port_caps[port - 1].has_smi = false;
                        continue;
-               } else if (!MLX5_CAP_GEN(dev->mdev, ib_virt)) {
+               } else if (!MLX5_CAP_GEN(dev->mdev, ib_virt) ||
+                       dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) {
                        dev->port_caps[port - 1].has_smi = true;
                        continue;
                }
@@ -3026,6 +3039,8 @@ static u32 get_core_cap_flags(struct ib_device *ibdev,
                return ret | RDMA_CORE_CAP_PROT_IB | RDMA_CORE_CAP_IB_MAD |
                        RDMA_CORE_CAP_IB_CM | RDMA_CORE_CAP_IB_SA |
                        RDMA_CORE_CAP_AF_IB;
+       else if (ibdev->type == RDMA_DEVICE_TYPE_SMI)
+               return ret | RDMA_CORE_CAP_IB_MAD | RDMA_CORE_CAP_IB_SMI;
 
        if (ll == IB_LINK_LAYER_INFINIBAND)
                return ret | RDMA_CORE_PORT_IBA_IB;
@@ -3062,6 +3077,9 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u32 port_num,
                return err;
 
        if (ll == IB_LINK_LAYER_INFINIBAND) {
+               if (ibdev->type == RDMA_DEVICE_TYPE_SMI)
+                       port_num = smi_to_native_portnum(dev, port_num);
+
                err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0,
                                                   &rep);
                if (err)
@@ -3862,12 +3880,18 @@ err_mp:
        return err;
 }
 
+static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent,
+                                            enum rdma_nl_dev_type type,
+                                            const char *name);
+static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev);
+
 static const struct ib_device_ops mlx5_ib_dev_ops = {
        .owner = THIS_MODULE,
        .driver_id = RDMA_DRIVER_MLX5,
        .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION,
 
        .add_gid = mlx5_ib_add_gid,
+       .add_sub_dev = mlx5_ib_add_sub_dev,
        .alloc_mr = mlx5_ib_alloc_mr,
        .alloc_mr_integrity = mlx5_ib_alloc_mr_integrity,
        .alloc_pd = mlx5_ib_alloc_pd,
@@ -3882,6 +3906,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
        .dealloc_pd = mlx5_ib_dealloc_pd,
        .dealloc_ucontext = mlx5_ib_dealloc_ucontext,
        .del_gid = mlx5_ib_del_gid,
+       .del_sub_dev = mlx5_ib_del_sub_dev,
        .dereg_mr = mlx5_ib_dereg_mr,
        .destroy_ah = mlx5_ib_destroy_ah,
        .destroy_cq = mlx5_ib_destroy_cq,
@@ -4171,7 +4196,9 @@ static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
 {
        const char *name;
 
-       if (!mlx5_lag_is_active(dev->mdev))
+       if (dev->sub_dev_name)
+               name = dev->sub_dev_name;
+       else if (!mlx5_lag_is_active(dev->mdev))
                name = "mlx5_%d";
        else
                name = "mlx5_bond_%d";
@@ -4432,6 +4459,89 @@ const struct mlx5_ib_profile raw_eth_profile = {
                     NULL),
 };
 
+static const struct mlx5_ib_profile plane_profile = {
+       STAGE_CREATE(MLX5_IB_STAGE_INIT,
+                    mlx5_ib_stage_init_init,
+                    mlx5_ib_stage_init_cleanup),
+       STAGE_CREATE(MLX5_IB_STAGE_CAPS,
+                    mlx5_ib_stage_caps_init,
+                    mlx5_ib_stage_caps_cleanup),
+       STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
+                    mlx5_ib_stage_non_default_cb,
+                    NULL),
+       STAGE_CREATE(MLX5_IB_STAGE_QP,
+                    mlx5_init_qp_table,
+                    mlx5_cleanup_qp_table),
+       STAGE_CREATE(MLX5_IB_STAGE_SRQ,
+                    mlx5_init_srq_table,
+                    mlx5_cleanup_srq_table),
+       STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
+                    mlx5_ib_dev_res_init,
+                    mlx5_ib_dev_res_cleanup),
+       STAGE_CREATE(MLX5_IB_STAGE_BFREG,
+                    mlx5_ib_stage_bfrag_init,
+                    mlx5_ib_stage_bfrag_cleanup),
+       STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
+                    mlx5_ib_stage_ib_reg_init,
+                    mlx5_ib_stage_ib_reg_cleanup),
+};
+
+static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent,
+                                            enum rdma_nl_dev_type type,
+                                            const char *name)
+{
+       struct mlx5_ib_dev *mparent = to_mdev(parent), *mplane;
+       enum rdma_link_layer ll;
+       int ret;
+
+       if (mparent->smi_dev)
+               return ERR_PTR(-EEXIST);
+
+       ll = mlx5_port_type_cap_to_rdma_ll(MLX5_CAP_GEN(mparent->mdev,
+                                                       port_type));
+       if (type != RDMA_DEVICE_TYPE_SMI || !mparent->num_plane ||
+           ll != IB_LINK_LAYER_INFINIBAND ||
+           !MLX5_CAP_GEN_2(mparent->mdev, multiplane_qp_ud))
+               return ERR_PTR(-EOPNOTSUPP);
+
+       mplane = ib_alloc_device(mlx5_ib_dev, ib_dev);
+       if (!mplane)
+               return ERR_PTR(-ENOMEM);
+
+       mplane->port = kcalloc(mparent->num_plane * mparent->num_ports,
+                              sizeof(*mplane->port), GFP_KERNEL);
+       if (!mplane->port) {
+               ret = -ENOMEM;
+               goto fail_kcalloc;
+       }
+
+       mplane->ib_dev.type = type;
+       mplane->mdev = mparent->mdev;
+       mplane->num_ports = mparent->num_plane;
+       mplane->sub_dev_name = name;
+
+       ret = __mlx5_ib_add(mplane, &plane_profile);
+       if (ret)
+               goto fail_ib_add;
+
+       mparent->smi_dev = mplane;
+       return &mplane->ib_dev;
+
+fail_ib_add:
+       kfree(mplane->port);
+fail_kcalloc:
+       ib_dealloc_device(&mplane->ib_dev);
+       return ERR_PTR(ret);
+}
+
+static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev)
+{
+       struct mlx5_ib_dev *mdev = to_mdev(sub_dev);
+
+       to_mdev(sub_dev->parent)->smi_dev = NULL;
+       __mlx5_ib_remove(mdev, mdev->profile, MLX5_IB_STAGE_MAX);
+}
+
 static int mlx5r_mp_probe(struct auxiliary_device *adev,
                          const struct auxiliary_device_id *id)
 {
index 49a5eebe69b8fa6bfa4928a1d2deaf4ecf216cc9..d5eb1b726675d222cc533411c782a0912041b4a7 100644 (file)
@@ -1191,6 +1191,8 @@ struct mlx5_ib_dev {
 #endif
 
        u8 num_plane;
+       struct mlx5_ib_dev *smi_dev;
+       const char *sub_dev_name;
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -1699,4 +1701,10 @@ static inline bool mlx5_umem_needs_ats(struct mlx5_ib_dev *dev,
 int set_roce_addr(struct mlx5_ib_dev *dev, u32 port_num,
                  unsigned int index, const union ib_gid *gid,
                  const struct ib_gid_attr *attr);
+
+static inline u32 smi_to_native_portnum(struct mlx5_ib_dev *dev, u32 port)
+{
+       return (port - 1) / dev->num_ports + 1;
+}
+
 #endif /* MLX5_IB_H */
index 079ad927182fbe185a7aae21ac676edc6bf28bc8..e39b1a101e9721b92b84b8e1135dbf8f1158ed41 100644 (file)
@@ -4217,7 +4217,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 
        /* todo implement counter_index functionality */
 
-       if (is_sqp(qp->type))
+       if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI && is_qp0(qp->type)) {
+               MLX5_SET(ads, pri_path, vhca_port_num,
+                        smi_to_native_portnum(dev, qp->port));
+               if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)
+                       MLX5_SET(ads, pri_path, plane_index, qp->port);
+       } else if (is_sqp(qp->type))
                MLX5_SET(ads, pri_path, vhca_port_num, qp->port);
 
        if (attr_mask & IB_QP_PORT)
index d9cf6982d645ed38fd084dfdca1e74a0564c9156..d3dcc272200afa95cc430f171fecbdeb6fcd09fc 100644 (file)
@@ -249,7 +249,8 @@ int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp,
        if (err)
                goto err_cmd;
 
-       mlx5_debug_qp_add(dev->mdev, qp);
+       if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI)
+               mlx5_debug_qp_add(dev->mdev, qp);
 
        return 0;
 
@@ -307,7 +308,8 @@ int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp)
 {
        u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
 
-       mlx5_debug_qp_remove(dev->mdev, qp);
+       if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI)
+               mlx5_debug_qp_remove(dev->mdev, qp);
 
        destroy_resource_common(dev, qp);
 
@@ -504,7 +506,9 @@ int mlx5_init_qp_table(struct mlx5_ib_dev *dev)
        spin_lock_init(&table->lock);
        INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
        xa_init(&table->dct_xa);
-       mlx5_qp_debugfs_init(dev->mdev);
+
+       if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI)
+               mlx5_qp_debugfs_init(dev->mdev);
 
        table->nb.notifier_call = rsc_event_notifier;
        mlx5_notifier_register(dev->mdev, &table->nb);
@@ -517,7 +521,8 @@ void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev)
        struct mlx5_qp_table *table = &dev->qp_table;
 
        mlx5_notifier_unregister(dev->mdev, &table->nb);
-       mlx5_qp_debugfs_cleanup(dev->mdev);
+       if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI)
+               mlx5_qp_debugfs_cleanup(dev->mdev);
 }
 
 int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp,