net/smc: stop links when their GID is removed

diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index a8845343d183e86ae7a23ed4dcd7dd9b023a68df..d93055ec17ae86cbaee63cc35943a71a118d1a74 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -17,6 +17,7 @@
 #include <linux/scatterlist.h>
 #include <linux/wait.h>
 #include <linux/mutex.h>
+#include <linux/inetdevice.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_cache.h>
 
@@ -62,16 +63,23 @@ static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
                IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
                IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
        struct ib_qp_attr qp_attr;
+       u8 hop_lim = 1;
 
        memset(&qp_attr, 0, sizeof(qp_attr));
        qp_attr.qp_state = IB_QPS_RTR;
        qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
        qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
        rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
-       rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0);
+       if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
+               hop_lim = IPV6_DEFAULT_HOPLIMIT;
+       rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0);
        rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
-       memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
-              sizeof(lnk->peer_mac));
+       if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
+               memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac,
+                      sizeof(lnk->lgr->nexthop_mac));
+       else
+               memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
+                      sizeof(lnk->peer_mac));
        qp_attr.dest_qp_num = lnk->peer_qpn;
        qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
        qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
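
As an aside (not part of the diff): when an SMC-Rv2 link group reaches its peer through an IPv4 gateway, the RoCE packets have to carry the gateway's MAC address as the Ethernet destination, and the GRH hop limit has to be larger than 1 so the first router does not discard them; on a flat subnet the peer's own MAC and a hop limit of 1 remain correct. A minimal userspace sketch of that selection, with hypothetical stand-ins for the smc_link/smc_link_group fields used above:

#include <stdint.h>
#include <string.h>

#define ETH_ALEN              6
#define IPV6_DEFAULT_HOPLIMIT 64  /* same value as the kernel's define */

/* Hypothetical, simplified stand-ins for the smc_link / smc_link_group
 * fields referenced in the hunk above.
 */
struct path_info {
        uint8_t peer_mac[ETH_ALEN];    /* MAC of the SMC peer itself    */
        uint8_t nexthop_mac[ETH_ALEN]; /* MAC of the IPv4 gateway       */
        int     is_v2;                 /* link group negotiated SMC-Rv2 */
        int     uses_gateway;          /* route lookup found a gateway  */
};

/* Pick destination MAC and GRH hop limit the way the hunk above does:
 * only an SMC-Rv2 link group behind a gateway sends to the gateway MAC
 * with a router-friendly hop limit.
 */
void select_ah_params(const struct path_info *p,
                      uint8_t dmac[ETH_ALEN], uint8_t *hop_lim)
{
        if (p->is_v2 && p->uses_gateway) {
                *hop_lim = IPV6_DEFAULT_HOPLIMIT;
                memcpy(dmac, p->nexthop_mac, ETH_ALEN);
        } else {
                *hop_lim = 1;
                memcpy(dmac, p->peer_mac, ETH_ALEN);
        }
}
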
@@ -183,9 +191,81 @@ bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
        return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
 }
 
+int smc_ib_find_route(__be32 saddr, __be32 daddr,
+                     u8 nexthop_mac[], u8 *uses_gateway)
+{
+       struct neighbour *neigh = NULL;
+       struct rtable *rt = NULL;
+       struct flowi4 fl4 = {
+               .saddr = saddr,
+               .daddr = daddr
+       };
+
+       if (daddr == cpu_to_be32(INADDR_NONE))
+               goto out;
+       rt = ip_route_output_flow(&init_net, &fl4, NULL);
+       if (IS_ERR(rt))
+               goto out;
+       if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET)
+               goto out;
+       neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr);
+       if (neigh) {
+               memcpy(nexthop_mac, neigh->ha, ETH_ALEN);
+               *uses_gateway = rt->rt_uses_gateway;
+               return 0;
+       }
+out:
+       return -ENOENT;
+}
+
+static int smc_ib_determine_gid_rcu(const struct net_device *ndev,
+                                   const struct ib_gid_attr *attr,
+                                   u8 gid[], u8 *sgid_index,
+                                   struct smc_init_info_smcrv2 *smcrv2)
+{
+       if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) {
+               if (gid)
+                       memcpy(gid, &attr->gid, SMC_GID_SIZE);
+               if (sgid_index)
+                       *sgid_index = attr->index;
+               return 0;
+       }
+       if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
+           smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) {
+               struct in_device *in_dev = __in_dev_get_rcu(ndev);
+               const struct in_ifaddr *ifa;
+               bool subnet_match = false;
+
+               if (!in_dev)
+                       goto out;
+               in_dev_for_each_ifa_rcu(ifa, in_dev) {
+                       if (!inet_ifa_match(smcrv2->saddr, ifa))
+                               continue;
+                       subnet_match = true;
+                       break;
+               }
+               if (!subnet_match)
+                       goto out;
+               if (smcrv2->daddr && smc_ib_find_route(smcrv2->saddr,
+                                                      smcrv2->daddr,
+                                                      smcrv2->nexthop_mac,
+                                                      &smcrv2->uses_gateway))
+                       goto out;
+
+               if (gid)
+                       memcpy(gid, &attr->gid, SMC_GID_SIZE);
+               if (sgid_index)
+                       *sgid_index = attr->index;
+               return 0;
+       }
+out:
+       return -ENODEV;
+}
+
 /* determine the gid for an ib-device port and vlan id */
 int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
-                        unsigned short vlan_id, u8 gid[], u8 *sgid_index)
+                        unsigned short vlan_id, u8 gid[], u8 *sgid_index,
+                        struct smc_init_info_smcrv2 *smcrv2)
 {
        const struct ib_gid_attr *attr;
        const struct net_device *ndev;
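
As an aside (not part of the diff): the new smcrv2 branch only accepts RoCEv2 (IB_GID_TYPE_ROCE_UDP_ENCAP) GID entries that encode a usable IPv4 address and that share a subnet with the source address chosen for the connection. The userspace sketch below models the two checks involved; gid_to_ipv4() is a simplified stand-in for smc_ib_gid_to_ipv4() that only handles the IPv4-mapped ::ffff:a.b.c.d form, and same_subnet() mirrors the inet_ifa_match()-style test in the loop above.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SMC_GID_SIZE    16
#define INADDR_NONE_BE  0xffffffffu  /* INADDR_NONE is all-ones in any byte order */

/* A RoCEv2 GID carrying an IPv4 address is an IPv4-mapped IPv6 address,
 * ::ffff:a.b.c.d, so the IPv4 address sits in the last four GID bytes.
 */
uint32_t gid_to_ipv4(const uint8_t gid[SMC_GID_SIZE])
{
        static const uint8_t v4mapped_prefix[12] = {
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff
        };
        uint32_t addr; /* network byte order */

        if (memcmp(gid, v4mapped_prefix, sizeof(v4mapped_prefix)))
                return INADDR_NONE_BE; /* no usable IPv4 address in this GID */
        memcpy(&addr, &gid[12], sizeof(addr));
        return addr;
}

/* inet_ifa_match()-style test: two addresses are in the same subnet when
 * they agree in every bit covered by the interface netmask.
 */
int same_subnet(uint32_t saddr, uint32_t ifa_address, uint32_t ifa_mask)
{
        return ((saddr ^ ifa_address) & ifa_mask) == 0;
}

int main(void)
{
        uint8_t gid[SMC_GID_SIZE] = {    /* ::ffff:192.168.1.10 */
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 192, 168, 1, 10
        };
        uint8_t ifa_bytes[4]  = { 192, 168, 1, 1 };   /* interface address */
        uint8_t mask_bytes[4] = { 255, 255, 255, 0 }; /* /24 netmask       */
        uint32_t saddr = gid_to_ipv4(gid), ifa, mask;

        memcpy(&ifa, ifa_bytes, sizeof(ifa));
        memcpy(&mask, mask_bytes, sizeof(mask));
        printf("IPv4-mapped: %d, same /24 as 192.168.1.1: %d\n",
               saddr != INADDR_NONE_BE, same_subnet(saddr, ifa, mask));
        return 0;
}
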
@@ -201,15 +281,13 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
                if (!IS_ERR(ndev) &&
                    ((!vlan_id && !is_vlan_dev(ndev)) ||
                     (vlan_id && is_vlan_dev(ndev) &&
-                     vlan_dev_vlan_id(ndev) == vlan_id)) &&
-                   attr->gid_type == IB_GID_TYPE_ROCE) {
-                       rcu_read_unlock();
-                       if (gid)
-                               memcpy(gid, &attr->gid, SMC_GID_SIZE);
-                       if (sgid_index)
-                               *sgid_index = attr->index;
-                       rdma_put_gid_attr(attr);
-                       return 0;
+                     vlan_dev_vlan_id(ndev) == vlan_id))) {
+                       if (!smc_ib_determine_gid_rcu(ndev, attr, gid,
+                                                     sgid_index, smcrv2)) {
+                               rcu_read_unlock();
+                               rdma_put_gid_attr(attr);
+                               return 0;
+                       }
                }
                rcu_read_unlock();
                rdma_put_gid_attr(attr);
@@ -217,6 +295,58 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
        return -ENODEV;
 }
 
+/* check if gid is still defined on smcibdev */
+static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2,
+                                 struct smc_ib_device *smcibdev, u8 ibport)
+{
+       const struct ib_gid_attr *attr;
+       bool rc = false;
+       int i;
+
+       for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
+               attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
+               if (IS_ERR(attr))
+                       continue;
+
+               rcu_read_lock();
+               if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) ||
+                   (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
+                    !(ipv6_addr_type((const struct in6_addr *)&attr->gid)
+                                    & IPV6_ADDR_LINKLOCAL)))
+                       if (!memcmp(gid, &attr->gid, SMC_GID_SIZE))
+                               rc = true;
+               rcu_read_unlock();
+               rdma_put_gid_attr(attr);
+       }
+       return rc;
+}
+
+/* check all links if the gid is still defined on smcibdev */
+static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport)
+{
+       struct smc_link_group *lgr;
+       int i;
+
+       spin_lock_bh(&smc_lgr_list.lock);
+       list_for_each_entry(lgr, &smc_lgr_list.list, list) {
+               if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
+                           SMC_MAX_PNETID_LEN))
+                       continue; /* lgr is not affected */
+               if (list_empty(&lgr->list))
+                       continue;
+               for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+                       if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
+                           lgr->lnk[i].smcibdev != smcibdev)
+                               continue;
+                       if (!smc_ib_check_link_gid(lgr->lnk[i].gid,
+                                                  lgr->smc_version == SMC_V2,
+                                                  smcibdev, ibport))
+                               smcr_port_err(smcibdev, ibport);
+               }
+       }
+       spin_unlock_bh(&smc_lgr_list.lock);
+}
+
 static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
 {
        int rc;
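
As an aside (not part of the diff): for RoCEv2 entries, smc_ib_check_link_gid() additionally requires that the GID is not IPv6 link-local (the ipv6_addr_type() & IPV6_ADDR_LINKLOCAL test), presumably because only IPv4-routable GIDs can back an SMC-Rv2 link. Link-local addresses are the fe80::/10 range, so the property being tested reduces to the first ten bits of the GID; a simplified standalone equivalent:

#include <stdint.h>

/* fe80::/10: the first byte is 0xfe and the top two bits of the second
 * byte are binary 10.  Simplified equivalent of testing
 * ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL for this one property.
 */
int gid_is_ipv6_linklocal(const uint8_t gid[16])
{
        return gid[0] == 0xfe && (gid[1] & 0xc0) == 0x80;
}
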
@@ -255,6 +385,7 @@ static void smc_ib_port_event_work(struct work_struct *work)
                } else {
                        clear_bit(port_idx, smcibdev->ports_going_away);
                        smcr_port_add(smcibdev, port_idx + 1);
+                       smc_ib_gid_check(smcibdev, port_idx + 1);
                }
        }
 }
@@ -523,6 +654,7 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk)
 /* create a queue pair within the protection domain for a link */
 int smc_ib_create_queue_pair(struct smc_link *lnk)
 {
+       int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
        struct ib_qp_init_attr qp_attr = {
                .event_handler = smc_ib_qp_event_handler,
                .qp_context = lnk,
@@ -536,7 +668,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
                        .max_send_wr = SMC_WR_BUF_CNT * 3,
                        .max_recv_wr = SMC_WR_BUF_CNT * 3,
                        .max_send_sge = SMC_IB_MAX_SEND_SGE,
-                       .max_recv_sge = 1,
+                       .max_recv_sge = sges_per_buf,
                },
                .sq_sig_type = IB_SIGNAL_REQ_WR,
                .qp_type = IB_QPT_RC,
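
As an aside (not part of the diff): max_recv_sge caps how many scatter-gather entries a single receive work request on this QP may carry, so it has to be at least the number of SGEs the work-request layer puts into each posted receive; setting sges_per_buf to 2 for an SMC_V2 link group gives the v2 receive path room for a second SGE per buffer. A hedged kernel-style sketch of posting such a two-SGE receive; the function name, DMA addresses and lkey are placeholders, not the actual SMC work-request-layer fields:

#include <rdma/ib_verbs.h>

/* Sketch only: post one receive WR whose buffer is split across two
 * scatter-gather entries, which is only valid if the QP was created with
 * qp_attr.cap.max_recv_sge >= 2.
 */
static int post_two_sge_recv(struct ib_qp *qp, u32 lkey,
                             u64 hdr_dma, u32 hdr_len,
                             u64 data_dma, u32 data_len)
{
        struct ib_sge sge[2] = {
                { .addr = hdr_dma,  .length = hdr_len,  .lkey = lkey },
                { .addr = data_dma, .length = data_len, .lkey = lkey },
        };
        struct ib_recv_wr wr = {
                .wr_id   = 1,   /* caller's cookie, reported with the completion */
                .sg_list = sge,
                .num_sge = 2,   /* must not exceed max_recv_sge */
        };

        return ib_post_recv(qp, &wr, NULL);
}
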