IB/ehca: Prevent RDMA-related connection failures on some eHCA2 hardware
authorJoachim Fenkes <fenkes@de.ibm.com>
Thu, 17 Jan 2008 14:07:24 +0000 (15:07 +0100)
committerRoland Dreier <rolandd@cisco.com>
Fri, 25 Jan 2008 22:15:44 +0000 (14:15 -0800)
Some HW revisions of eHCA2 may cause an RC connection to break if they
received RDMA Reads over that connection before.  This can be
prevented by assuring that, after the first RDMA Read, the QP receives
a new RDMA Read every few million link packets.

Include code into the driver that inserts an empty (size 0) RDMA Read
into the message stream every now and then if the consumer doesn't
post them frequently enough.

Signed-off-by: Joachim Fenkes <fenkes@de.ibm.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
drivers/infiniband/hw/ehca/ehca_classes.h
drivers/infiniband/hw/ehca/ehca_qp.c
drivers/infiniband/hw/ehca/ehca_reqs.c

index 2502366e845f8939ce2a27c4d44dacda09a04836..f281d16040f5ac794e14e28820b37e537a7db51f 100644 (file)
@@ -183,6 +183,11 @@ struct ehca_qp {
        u32 mm_count_squeue;
        u32 mm_count_rqueue;
        u32 mm_count_galpa;
+       /* unsolicited ack circumvention */
+       int unsol_ack_circ;
+       int mtu_shift;
+       u32 message_count;
+       u32 packet_count;
 };
 
 #define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ)
index 8d3c35fa051b8b04c46df2bc2d320758bbc4476f..1012f15a7140e11cf579548069a80f890389ce6a 100644 (file)
@@ -592,10 +592,8 @@ static struct ehca_qp *internal_create_qp(
                goto create_qp_exit1;
        }
 
-       if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
-               parms.sigtype = HCALL_SIGT_EVERY;
-       else
-               parms.sigtype = HCALL_SIGT_BY_WQE;
+       /* Always signal by WQE so we can hide circ. WQEs */
+       parms.sigtype = HCALL_SIGT_BY_WQE;
 
        /* UD_AV CIRCUMVENTION */
        max_send_sge = init_attr->cap.max_send_sge;
@@ -618,6 +616,10 @@ static struct ehca_qp *internal_create_qp(
        parms.squeue.max_sge = max_send_sge;
        parms.rqueue.max_sge = max_recv_sge;
 
+       /* RC QPs need one more SWQE for unsolicited ack circumvention */
+       if (qp_type == IB_QPT_RC)
+               parms.squeue.max_wr++;
+
        if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) {
                if (HAS_SQ(my_qp))
                        ehca_determine_small_queue(
@@ -650,6 +652,8 @@ static struct ehca_qp *internal_create_qp(
                        parms.squeue.act_nr_sges = 1;
                        parms.rqueue.act_nr_sges = 1;
                }
+               /* hide the extra WQE */
+               parms.squeue.act_nr_wqes--;
                break;
        case IB_QPT_UD:
        case IB_QPT_GSI:
@@ -1294,6 +1298,8 @@ static int internal_modify_qp(struct ib_qp *ibqp,
        }
 
        if (attr_mask & IB_QP_PATH_MTU) {
+               /* store ld(MTU) */
+               my_qp->mtu_shift = attr->path_mtu + 7;
                mqpcb->path_mtu = attr->path_mtu;
                update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1);
        }
index ea91360835d3279a27b885a5fd045b7fc9fbfff7..3aacc8cf1e448391feb37accef997118c5ca6f08 100644 (file)
@@ -50,6 +50,9 @@
 #include "hcp_if.h"
 #include "hipz_fns.h"
 
+/* in RC traffic, insert an empty RDMA READ every this many packets */
+#define ACK_CIRC_THRESHOLD 2000000
+
 static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
                                  struct ehca_wqe *wqe_p,
                                  struct ib_recv_wr *recv_wr)
@@ -81,7 +84,7 @@ static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
        if (ehca_debug_level) {
                ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p",
                             ipz_rqueue);
-               ehca_dmp( wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
+               ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
        }
 
        return 0;
@@ -135,7 +138,8 @@ static void trace_send_wr_ud(const struct ib_send_wr *send_wr)
 
 static inline int ehca_write_swqe(struct ehca_qp *qp,
                                  struct ehca_wqe *wqe_p,
-                                 const struct ib_send_wr *send_wr)
+                                 const struct ib_send_wr *send_wr,
+                                 int hidden)
 {
        u32 idx;
        u64 dma_length;
@@ -176,7 +180,9 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
 
        wqe_p->wr_flag = 0;
 
-       if (send_wr->send_flags & IB_SEND_SIGNALED)
+       if ((send_wr->send_flags & IB_SEND_SIGNALED ||
+           qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR)
+           && !hidden)
                wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM;
 
        if (send_wr->opcode == IB_WR_SEND_WITH_IMM ||
@@ -199,7 +205,7 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
 
                wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8;
                wqe_p->local_ee_context_qkey = remote_qkey;
-               if (!send_wr->wr.ud.ah) {
+               if (unlikely(!send_wr->wr.ud.ah)) {
                        ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp);
                        return -EINVAL;
                }
@@ -255,6 +261,15 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
                } /* eof idx */
                wqe_p->u.nud.atomic_1st_op_dma_len = dma_length;
 
+               /* unsolicited ack circumvention */
+               if (send_wr->opcode == IB_WR_RDMA_READ) {
+                       /* on RDMA read, switch on and reset counters */
+                       qp->message_count = qp->packet_count = 0;
+                       qp->unsol_ack_circ = 1;
+               } else
+                       /* else estimate #packets */
+                       qp->packet_count += (dma_length >> qp->mtu_shift) + 1;
+
                break;
 
        default:
@@ -355,13 +370,49 @@ static inline void map_ib_wc_status(u32 cqe_status,
                *wc_status = IB_WC_SUCCESS;
 }
 
+static inline int post_one_send(struct ehca_qp *my_qp,
+                        struct ib_send_wr *cur_send_wr,
+                        struct ib_send_wr **bad_send_wr,
+                        int hidden)
+{
+       struct ehca_wqe *wqe_p;
+       int ret;
+       u64 start_offset = my_qp->ipz_squeue.current_q_offset;
+
+       /* get pointer next to free WQE */
+       wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
+       if (unlikely(!wqe_p)) {
+               /* too many posted work requests: queue overflow */
+               if (bad_send_wr)
+                       *bad_send_wr = cur_send_wr;
+               ehca_err(my_qp->ib_qp.device, "Too many posted WQEs "
+                        "qp_num=%x", my_qp->ib_qp.qp_num);
+               return -ENOMEM;
+       }
+       /* write a SEND WQE into the QUEUE */
+       ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, hidden);
+       /*
+        * if something failed,
+        * reset the free entry pointer to the start value
+        */
+       if (unlikely(ret)) {
+               my_qp->ipz_squeue.current_q_offset = start_offset;
+               if (bad_send_wr)
+                       *bad_send_wr = cur_send_wr;
+               ehca_err(my_qp->ib_qp.device, "Could not write WQE "
+                        "qp_num=%x", my_qp->ib_qp.qp_num);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
 int ehca_post_send(struct ib_qp *qp,
                   struct ib_send_wr *send_wr,
                   struct ib_send_wr **bad_send_wr)
 {
        struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
        struct ib_send_wr *cur_send_wr;
-       struct ehca_wqe *wqe_p;
        int wqe_cnt = 0;
        int ret = 0;
        unsigned long flags;
@@ -369,37 +420,33 @@ int ehca_post_send(struct ib_qp *qp,
        /* LOCK the QUEUE */
        spin_lock_irqsave(&my_qp->spinlock_s, flags);
 
+       /* Send an empty extra RDMA read if:
+        *  1) there has been an RDMA read on this connection before
+        *  2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets
+        *  3) we can be sure that any previous extra RDMA read has been
+        *     processed so we don't overflow the SQ
+        */
+       if (unlikely(my_qp->unsol_ack_circ &&
+                    my_qp->packet_count > ACK_CIRC_THRESHOLD &&
+                    my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) {
+               /* insert an empty RDMA READ to fix up the remote QP state */
+               struct ib_send_wr circ_wr;
+               memset(&circ_wr, 0, sizeof(circ_wr));
+               circ_wr.opcode = IB_WR_RDMA_READ;
+               post_one_send(my_qp, &circ_wr, NULL, 1); /* ignore retcode */
+               wqe_cnt++;
+               ehca_dbg(qp->device, "posted circ wr  qp_num=%x", qp->qp_num);
+               my_qp->message_count = my_qp->packet_count = 0;
+       }
+
        /* loop processes list of send reqs */
        for (cur_send_wr = send_wr; cur_send_wr != NULL;
             cur_send_wr = cur_send_wr->next) {
-               u64 start_offset = my_qp->ipz_squeue.current_q_offset;
-               /* get pointer next to free WQE */
-               wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
-               if (unlikely(!wqe_p)) {
-                       /* too many posted work requests: queue overflow */
-                       if (bad_send_wr)
-                               *bad_send_wr = cur_send_wr;
-                       if (wqe_cnt == 0) {
-                               ret = -ENOMEM;
-                               ehca_err(qp->device, "Too many posted WQEs "
-                                        "qp_num=%x", qp->qp_num);
-                       }
-                       goto post_send_exit0;
-               }
-               /* write a SEND WQE into the QUEUE */
-               ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr);
-               /*
-                * if something failed,
-                * reset the free entry pointer to the start value
-                */
+               ret = post_one_send(my_qp, cur_send_wr, bad_send_wr, 0);
                if (unlikely(ret)) {
-                       my_qp->ipz_squeue.current_q_offset = start_offset;
-                       *bad_send_wr = cur_send_wr;
-                       if (wqe_cnt == 0) {
-                               ret = -EINVAL;
-                               ehca_err(qp->device, "Could not write WQE "
-                                        "qp_num=%x", qp->qp_num);
-                       }
+                       /* if one or more WQEs were successful, don't fail */
+                       if (wqe_cnt)
+                               ret = 0;
                        goto post_send_exit0;
                }
                wqe_cnt++;
@@ -410,6 +457,7 @@ int ehca_post_send(struct ib_qp *qp,
 post_send_exit0:
        iosync(); /* serialize GAL register access */
        hipz_update_sqa(my_qp, wqe_cnt);
+       my_qp->message_count += wqe_cnt;
        spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
        return ret;
 }