iw_cxgb4: Support FW write completion WR
author Potnuri Bharat Teja <bharat@chelsio.com>
Thu, 2 Aug 2018 06:03:04 +0000 (11:33 +0530)
committer Jason Gunthorpe <jgg@mellanox.com>
Fri, 3 Aug 2018 02:16:02 +0000 (20:16 -0600)
To optimize NVME-oF READ IOPs, use a specialized WQE that combines
the RDMA WRITE and SEND_INV WR chain submitted by the NVME-oF target
driver.

This reduces uP overhead per NVME-oF IO, and results in over 10%
improvement in NVME-oF 4K READ IOPs.
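For context, the chain being fused looks roughly like this on the posting side (an illustrative sketch, not code from this patch; qp, data_sge, ndata, cqe_sge, req_addr, and req_rkey are hypothetical):

        struct ib_rdma_wr write_wr = {
                .wr.opcode = IB_WR_RDMA_WRITE,
                .wr.sg_list = data_sge,         /* READ payload */
                .wr.num_sge = ndata,            /* small SGE count for the fastpath */
                .remote_addr = req_addr,
                .rkey = req_rkey,
        };
        struct ib_send_wr send_wr = {
                .opcode = IB_WR_SEND_WITH_INV,
                .sg_list = cqe_sge,             /* one 16-byte NVMe CQE */
                .num_sge = 1,
                .ex.invalidate_rkey = req_rkey,
                .send_flags = IB_SEND_SIGNALED,
        };
        const struct ib_send_wr *bad_wr;

        write_wr.wr.next = &send_wr;             /* the two-WR chain... */
        ib_post_send(qp, &write_wr.wr, &bad_wr); /* ...fused into one WQE below */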

Signed-off-by: Potnuri Bharat Teja <bharat@chelsio.com>
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
drivers/infiniband/hw/cxgb4/device.c
drivers/infiniband/hw/cxgb4/qp.c
drivers/infiniband/hw/cxgb4/t4.h
drivers/infiniband/hw/cxgb4/t4fw_ri_api.h

index 5ef082bfa95a0bd68212de66dc021499c601f4df..c13c0ba30f63e8f62694f896b8251bea9db5498a 100644 (file)
@@ -866,6 +866,7 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
        rdev->status_page->qp_size = rdev->lldi.vr->qp.size;
        rdev->status_page->cq_start = rdev->lldi.vr->cq.start;
        rdev->status_page->cq_size = rdev->lldi.vr->cq.size;
+       rdev->status_page->write_cmpl_supported = rdev->lldi.write_cmpl_support;
 
        if (c4iw_wr_log) {
                rdev->wr_log = kcalloc(1 << c4iw_wr_log_size_order,
index 5d30cd14f795b2231629631084e660fe8d597a5a..b3203afa3b1de705c74da5aaec9ca55a3172b0d2 100644 (file)
@@ -455,7 +455,12 @@ static int build_isgl(__be64 *queue_start, __be64 *queue_end,
 {
        int i;
        u32 plen = 0;
-       __be64 *flitp = (__be64 *)isglp->sge;
+       __be64 *flitp;
+
+       if ((__be64 *)isglp == queue_end)
+               isglp = (struct fw_ri_isgl *)queue_start;
+
+       flitp = (__be64 *)isglp->sge;
 
        for (i = 0; i < num_sge; i++) {
                if ((plen + sg_list[i].length) < plen)
@@ -597,6 +602,56 @@ static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe,
        return 0;
 }
 
+static void build_immd_cmpl(struct t4_sq *sq, struct fw_ri_immd_cmpl *immdp,
+                           struct ib_send_wr *wr)
+{
+       memcpy((u8 *)immdp->data, (u8 *)(uintptr_t)wr->sg_list->addr, 16);
+       memset(immdp->r1, 0, 6);
+       immdp->op = FW_RI_DATA_IMMD;
+       immdp->immdlen = 16;
+}
+
+static void build_rdma_write_cmpl(struct t4_sq *sq,
+                                 struct fw_ri_rdma_write_cmpl_wr *wcwr,
+                                 const struct ib_send_wr *wr, u8 *len16)
+{
+       u32 plen;
+       int size;
+
+       /*
+        * This code assumes the struct fields preceding the write isgl
+        * fit in one 64B WR slot.  This is because the WQE is built
+        * directly in the DMA queue, and wrapping is only handled
+        * by the code building sgls, i.e. the "fixed part" of the WR
+        * structs must all fit in 64B.  The WQE build code should probably be
+        * redesigned to avoid this restriction, but for now just add
+        * the BUILD_BUG_ON() to catch if this WQE struct gets too big.
+        */
+       BUILD_BUG_ON(offsetof(struct fw_ri_rdma_write_cmpl_wr, u) > 64);
+
+       wcwr->stag_sink = cpu_to_be32(rdma_wr(wr)->rkey);
+       wcwr->to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr);
+       wcwr->stag_inv = cpu_to_be32(wr->next->ex.invalidate_rkey);
+       wcwr->r2 = 0;
+       wcwr->r3 = 0;
+
+       /* SEND_INV SGL */
+       if (wr->next->send_flags & IB_SEND_INLINE)
+               build_immd_cmpl(sq, &wcwr->u_cmpl.immd_src, wr->next);
+       else
+               build_isgl((__be64 *)sq->queue, (__be64 *)&sq->queue[sq->size],
+                          &wcwr->u_cmpl.isgl_src, wr->next->sg_list, 1, NULL);
+
+       /* WRITE SGL */
+       build_isgl((__be64 *)sq->queue, (__be64 *)&sq->queue[sq->size],
+                  wcwr->u.isgl_src, wr->sg_list, wr->num_sge, &plen);
+
+       size = sizeof(*wcwr) + sizeof(struct fw_ri_isgl) +
+               wr->num_sge * sizeof(struct fw_ri_sge);
+       wcwr->plen = cpu_to_be32(plen);
+       *len16 = DIV_ROUND_UP(size, 16);
+}
+
 static int build_rdma_read(union t4_wr *wqe, const struct ib_send_wr *wr,
                           u8 *len16)
 {
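For concreteness, the size arithmetic in build_rdma_write_cmpl() above works out as follows (an illustrative sketch, assuming the sizes implied by the t4fw_ri_api.h definitions in this patch: a 64B fixed WR part, an 8B fw_ri_isgl header, and a 16B fw_ri_sge):

        /* Hypothetical case: a WRITE with num_sge == 2.
         *   size  = sizeof(*wcwr)                 =  64  (fixed part; u_cmpl and
         *                                                 r3 included, trailing
         *                                                 union u is zero-sized)
         *         + sizeof(struct fw_ri_isgl)     =   8  (WRITE ISGL header)
         *         + 2 * sizeof(struct fw_ri_sge)  =  32  (two WRITE SGEs)
         *                                           ----
         *                                           104 bytes
         *   len16 = DIV_ROUND_UP(104, 16)         =   7
         * The SEND_WITH_INV payload costs nothing extra: it lands in the
         * fixed-size u_cmpl union already counted in sizeof(*wcwr).
         */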
@@ -627,6 +682,72 @@ static int build_rdma_read(union t4_wr *wqe, const struct ib_send_wr *wr,
        return 0;
 }
 
+static void post_write_cmpl(struct c4iw_qp *qhp, const struct ib_send_wr *wr)
+{
+       bool send_signaled = (wr->next->send_flags & IB_SEND_SIGNALED) ||
+                            qhp->sq_sig_all;
+       bool write_signaled = (wr->send_flags & IB_SEND_SIGNALED) ||
+                             qhp->sq_sig_all;
+       struct t4_swsqe *swsqe;
+       union t4_wr *wqe;
+       u16 write_wrid;
+       u8 len16;
+       u16 idx;
+
+       /*
+        * The sw_sq entries still look like a WRITE and a SEND and consume
+        * 2 slots. The FW WR, however, will be a single uber-WR.
+        */
+       wqe = (union t4_wr *)((u8 *)qhp->wq.sq.queue +
+              qhp->wq.sq.wq_pidx * T4_EQ_ENTRY_SIZE);
+       build_rdma_write_cmpl(&qhp->wq.sq, &wqe->write_cmpl, wr, &len16);
+
+       /* WRITE swsqe */
+       swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx];
+       swsqe->opcode = FW_RI_RDMA_WRITE;
+       swsqe->idx = qhp->wq.sq.pidx;
+       swsqe->complete = 0;
+       swsqe->signaled = write_signaled;
+       swsqe->flushed = 0;
+       swsqe->wr_id = wr->wr_id;
+       if (c4iw_wr_log) {
+               swsqe->sge_ts =
+                       cxgb4_read_sge_timestamp(qhp->rhp->rdev.lldi.ports[0]);
+               swsqe->host_time = ktime_get();
+       }
+
+       write_wrid = qhp->wq.sq.pidx;
+
+       /* just bump the sw_sq */
+       qhp->wq.sq.in_use++;
+       if (++qhp->wq.sq.pidx == qhp->wq.sq.size)
+               qhp->wq.sq.pidx = 0;
+
+       /* SEND_WITH_INV swsqe */
+       swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx];
+       swsqe->opcode = FW_RI_SEND_WITH_INV;
+       swsqe->idx = qhp->wq.sq.pidx;
+       swsqe->complete = 0;
+       swsqe->signaled = send_signaled;
+       swsqe->flushed = 0;
+       swsqe->wr_id = wr->next->wr_id;
+       if (c4iw_wr_log) {
+               swsqe->sge_ts =
+                       cxgb4_read_sge_timestamp(qhp->rhp->rdev.lldi.ports[0]);
+               swsqe->host_time = ktime_get();
+       }
+
+       wqe->write_cmpl.flags_send = send_signaled ? FW_RI_COMPLETION_FLAG : 0;
+       wqe->write_cmpl.wrid_send = qhp->wq.sq.pidx;
+
+       init_wr_hdr(wqe, write_wrid, FW_RI_RDMA_WRITE_CMPL_WR,
+                   write_signaled ? FW_RI_COMPLETION_FLAG : 0, len16);
+       t4_sq_produce(&qhp->wq, len16);
+       idx = DIV_ROUND_UP(len16 * 16, T4_EQ_ENTRY_SIZE);
+
+       t4_ring_sq_db(&qhp->wq, idx, wqe);
+}
+
 static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe,
                           const struct ib_recv_wr *wr, u8 *len16)
 {
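To make the two-swsqe/one-WQE bookkeeping in post_write_cmpl() concrete, here is a worked trace with illustrative index values (a sketch, not code from the patch):

        /* Suppose sq.pidx == 5 on entry:
         *   WRITE swsqe          -> sw_sq[5]; write_wrid = 5
         *   manual bump          -> sq.in_use++, sq.pidx becomes 6
         *   SEND_WITH_INV swsqe  -> sw_sq[6]; wrid_send = 6
         *   init_wr_hdr()        -> a single FW WR carrying wrid 5 and
         *                           wrid_send 6, so the CQ code can
         *                           complete both sw_sq entries from the
         *                           one write_cmpl WQE
         *   t4_sq_produce()      -> bumps in_use/pidx once more for the
         *                           SEND entry and advances the hardware
         *                           wq_pidx by len16
         */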
@@ -1007,6 +1128,30 @@ int c4iw_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
                *bad_wr = wr;
                return -ENOMEM;
        }
+
+       /*
+        * Fastpath for the NVMe-oF target WRITE + SEND_WITH_INV wr chain,
+        * which is the response for small NVMe-oF READ requests.  If the
+        * chain is exactly a WRITE->SEND_WITH_INV and the sgl depths and
+        * lengths meet the requirements of the fw_ri_rdma_write_cmpl_wr
+        * work request, then build and post the write_cmpl WR.  If any of
+        * the tests below are not true, then we continue on with the
+        * traditional WRITE and SEND WRs.
+        */
+       if (qhp->rhp->rdev.lldi.write_cmpl_support &&
+           CHELSIO_CHIP_VERSION(qhp->rhp->rdev.lldi.adapter_type) >=
+           CHELSIO_T5 &&
+           wr && wr->next && !wr->next->next &&
+           wr->opcode == IB_WR_RDMA_WRITE &&
+           wr->sg_list[0].length && wr->num_sge <= T4_WRITE_CMPL_MAX_SGL &&
+           wr->next->opcode == IB_WR_SEND_WITH_INV &&
+           wr->next->sg_list[0].length == T4_WRITE_CMPL_MAX_CQE &&
+           wr->next->num_sge == 1 && num_wrs >= 2) {
+               post_write_cmpl(qhp, wr);
+               spin_unlock_irqrestore(&qhp->lock, flag);
+               return 0;
+       }
+
        while (wr) {
                if (num_wrs == 0) {
                        err = -ENOMEM;
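Restated as a standalone predicate, the gating test above reads as follows (an illustrative helper only; the patch open-codes the test inline in c4iw_post_send()):

        static bool use_write_cmpl(struct c4iw_qp *qhp, const struct ib_send_wr *wr,
                                   int num_wrs)
        {
                return qhp->rhp->rdev.lldi.write_cmpl_support &&  /* FW support */
                       CHELSIO_CHIP_VERSION(qhp->rhp->rdev.lldi.adapter_type) >=
                       CHELSIO_T5 &&                              /* T5 or later */
                       wr && wr->next && !wr->next->next &&       /* exactly two WRs */
                       wr->opcode == IB_WR_RDMA_WRITE &&          /* a WRITE first... */
                       wr->sg_list[0].length &&                   /* with a payload */
                       wr->num_sge <= T4_WRITE_CMPL_MAX_SGL &&    /* at most 4 SGEs */
                       wr->next->opcode == IB_WR_SEND_WITH_INV && /* ...then SEND_WITH_INV */
                       wr->next->sg_list[0].length == T4_WRITE_CMPL_MAX_CQE &&
                       wr->next->num_sge == 1 &&                  /* one 16B CQE SGE */
                       num_wrs >= 2;                              /* SQ room for both */
        }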
index 0fb3e55f37c19e350784e0b53f5c6724d2d1118f..e42021fd6fd60b0cd2488182998723b6146a25d0 100644 (file)
@@ -91,6 +91,9 @@ static inline int t4_max_fr_depth(int use_dsgl)
 #define T4_RQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_RQ_NUM_SLOTS)
 #define T4_MAX_RECV_SGE 4
 
+#define T4_WRITE_CMPL_MAX_SGL 4
+#define T4_WRITE_CMPL_MAX_CQE 16
+
 union t4_wr {
        struct fw_ri_res_wr res;
        struct fw_ri_wr ri;
@@ -101,6 +104,7 @@ union t4_wr {
        struct fw_ri_fr_nsmr_wr fr;
        struct fw_ri_fr_nsmr_tpte_wr fr_tpte;
        struct fw_ri_inv_lstag_wr inv;
+       struct fw_ri_rdma_write_cmpl_wr write_cmpl;
        struct t4_status_page status;
        __be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_SQ_NUM_SLOTS];
 };
@@ -851,7 +855,7 @@ static inline void t4_set_cq_in_error(struct t4_cq *cq)
 
 struct t4_dev_status_page {
        u8 db_off;
-       u8 pad1;
+       u8 write_cmpl_supported;
        u16 pad2;
        u32 pad3;
        u64 qp_start;
index 62606e66ba2038943a752e3a8684af6dfc8786a1..cbdb300a47943a73461a92ae0e7bfe1952d4fe82 100644 (file)
@@ -595,6 +595,37 @@ struct fw_ri_send_wr {
 #define FW_RI_SEND_WR_SENDOP_G(x)      \
        (((x) >> FW_RI_SEND_WR_SENDOP_S) & FW_RI_SEND_WR_SENDOP_M)
 
+struct fw_ri_rdma_write_cmpl_wr {
+       __u8   opcode;
+       __u8   flags;
+       __u16  wrid;
+       __u8   r1[3];
+       __u8   len16;
+       __u8   r2;
+       __u8   flags_send;
+       __u16  wrid_send;
+       __be32 stag_inv;
+       __be32 plen;
+       __be32 stag_sink;
+       __be64 to_sink;
+       union fw_ri_cmpl {
+               struct fw_ri_immd_cmpl {
+                       __u8   op;
+                       __u8   r1[6];
+                       __u8   immdlen;
+                       __u8   data[16];
+               } immd_src;
+               struct fw_ri_isgl isgl_src;
+       } u_cmpl;
+       __be64 r3;
+#ifndef C99_NOT_SUPPORTED
+       union fw_ri_write {
+               struct fw_ri_immd immd_src[0];
+               struct fw_ri_isgl isgl_src[0];
+       } u;
+#endif
+};
+
 struct fw_ri_rdma_read_wr {
        __u8   opcode;
        __u8   flags;
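As a closing check on the layout above (a sketch, assuming natural packing of the fields): the fixed part of fw_ri_rdma_write_cmpl_wr sums to exactly 64 bytes, which is why the BUILD_BUG_ON() in build_rdma_write_cmpl() passes with no slack:

        /* Byte accounting for the fixed part (everything before union u):
         *   opcode + flags + wrid + r1[3] + len16       8
         *   r2 + flags_send + wrid_send                 4
         *   stag_inv + plen + stag_sink                12
         *   to_sink                                     8
         *   union fw_ri_cmpl (immd_cmpl: 1+6+1+16)     24
         *   r3                                          8
         *                                             ----
         *                                              64 == T4_EQ_ENTRY_SIZE
         * A tighter, hypothetical variant of the patch's check:
         */
        BUILD_BUG_ON(offsetof(struct fw_ri_rdma_write_cmpl_wr, u) != 64);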