nvme-rdma: don't suppress send completions

author Sagi Grimberg <sagi@grimberg.me>

Thu, 23 Nov 2017 15:35:21 +0000 (17:35 +0200)

committer Christoph Hellwig <hch@lst.de>

Sun, 26 Nov 2017 14:33:32 +0000 (15:33 +0100)
author Sagi Grimberg <sagi@grimberg.me>
Thu, 23 Nov 2017 15:35:21 +0000 (17:35 +0200)
committer Christoph Hellwig <hch@lst.de>
Sun, 26 Nov 2017 14:33:32 +0000 (15:33 +0100)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c

index 2c597105a6bf31e362d2a06a6289d4a0c1f28829..61511bed8acafb41852d804fc90b6c86bf8a40fe 100644 (file)
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -77,7 +77,6 @@ enum nvme_rdma_queue_flags {
  
  struct nvme_rdma_queue {
         struct nvme_rdma_qe     *rsp_ring;
-       atomic_t                sig_count;
         int                     queue_size;
         size_t                  cmnd_capsule_len;
         struct nvme_rdma_ctrl   *ctrl;
@@ -510,7 +509,6 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
                 queue->cmnd_capsule_len = sizeof(struct nvme_command);
  
         queue->queue_size = queue_size;
-       atomic_set(&queue->sig_count, 0);
  
         queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
                         RDMA_PS_TCP, IB_QPT_RC);
@@ -1204,21 +1202,9 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
                 nvme_rdma_wr_error(cq, wc, "SEND");
  }
  
-/*
- * We want to signal completion at least every queue depth/2.  This returns the
- * largest power of two that is not above half of (queue size + 1) to optimize
- * (avoid divisions).
- */
-static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
-{
-       int limit = 1 << ilog2((queue->queue_size + 1) / 2);
-
-       return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0;
-}
-
  static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
                 struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
-               struct ib_send_wr *first, bool flush)
+               struct ib_send_wr *first)
  {
         struct ib_send_wr wr, *bad_wr;
         int ret;
@@ -1227,31 +1213,12 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
         sge->length = sizeof(struct nvme_command),
         sge->lkey   = queue->device->pd->local_dma_lkey;
  
-       qe->cqe.done = nvme_rdma_send_done;
-
         wr.next       = NULL;
         wr.wr_cqe     = &qe->cqe;
         wr.sg_list    = sge;
         wr.num_sge    = num_sge;
         wr.opcode     = IB_WR_SEND;
-       wr.send_flags = 0;
-
-       /*
-        * Unsignalled send completions are another giant desaster in the
-        * IB Verbs spec:  If we don't regularly post signalled sends
-        * the send queue will fill up and only a QP reset will rescue us.
-        * Would have been way to obvious to handle this in hardware or
-        * at least the RDMA stack..
-        *
-        * Always signal the flushes. The magic request used for the flush
-        * sequencer is not allocated in our driver's tagset and it's
-        * triggered to be freed by blk_cleanup_queue(). So we need to
-        * always mark it as signaled to ensure that the "wr_cqe", which is
-        * embedded in request's payload, is not freed when __ib_process_cq()
-        * calls wr_cqe->done().
-        */
-       if (nvme_rdma_queue_sig_limit(queue) || flush)
-               wr.send_flags |= IB_SEND_SIGNALED;
+       wr.send_flags = IB_SEND_SIGNALED;
  
         if (first)
                 first->next = &wr;
@@ -1301,6 +1268,12 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
         return queue->ctrl->tag_set.tags[queue_idx - 1];
  }
  
+static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+       if (unlikely(wc->status != IB_WC_SUCCESS))
+               nvme_rdma_wr_error(cq, wc, "ASYNC");
+}
+
  static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
  {
         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
@@ -1319,10 +1292,12 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
         cmd->common.flags |= NVME_CMD_SGL_METABUF;
         nvme_rdma_set_sg_null(cmd);
  
+       sqe->cqe.done = nvme_rdma_async_done;
+
         ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
                         DMA_TO_DEVICE);
  
-       ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false);
+       ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
         WARN_ON_ONCE(ret);
  }
  
@@ -1607,7 +1582,6 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
         struct nvme_rdma_qe *sqe = &req->sqe;
         struct nvme_command *c = sqe->data;
-       bool flush = false;
         struct ib_device *dev;
         blk_status_t ret;
         int err;
@@ -1636,13 +1610,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
                 goto err;
         }
  
+       sqe->cqe.done = nvme_rdma_send_done;
+
         ib_dma_sync_single_for_device(dev, sqe->dma,
                         sizeof(struct nvme_command), DMA_TO_DEVICE);
  
-       if (req_op(rq) == REQ_OP_FLUSH)
-               flush = true;
         err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
-                       req->mr->need_inval ? &req->reg_wr.wr : NULL, flush);
+                       req->mr->need_inval ? &req->reg_wr.wr : NULL);
         if (unlikely(err)) {
                 nvme_rdma_unmap_data(queue, rq);
                 goto err;
author	Sagi Grimberg <sagi@grimberg.me>
	Thu, 23 Nov 2017 15:35:21 +0000 (17:35 +0200)
committer	Christoph Hellwig <hch@lst.de>
	Sun, 26 Nov 2017 14:33:32 +0000 (15:33 +0100)