author    Israel Rukshin <israelr@nvidia.com>    2022-05-15 18:04:40 +0300
committer Christoph Hellwig <hch@lst.de>         2022-07-12 17:34:35 +0200
commit    6c16bb03731017adb66e6bf234e6ebd4a64fa926 (patch)
tree      391f8f156ac70f168d6a6abfd091fe4c5782919a
parent    375e2143d8f411c181eb630dd2f27a21e5a1a6e9 (diff)
nvme-rdma: remove timeout for getting RDMA-CM established event
In case many controllers start error recovery at the same time (i.e. when a port goes down and comes back up), they may never succeed in reconnecting. This is because the target cannot handle all the connect requests within three seconds (the arbitrary value set today). Even if some of the connections are established, a single queue that fails to connect causes all of the controller's queues to be destroyed as well. So, on the following reconnection attempts the number of connect requests may remain the same. To fix this, remove the timeout and wait for an RDMA-CM event to abort/complete the connect request. RDMA-CM delivers an unreachable event when its own timeout of roughly 90 seconds expires. This approach is used by other RDMA-CM users such as SRP and iSER in blocking mode. The commit also renames NVME_RDMA_CONNECT_TIMEOUT_MS to NVME_RDMA_CM_TIMEOUT_MS.

Signed-off-by: Israel Rukshin <israelr@nvidia.com>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Acked-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
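The simplified error handling in nvme_rdma_wait_for_cm() follows directly from the return conventions of the two wait primitives. A minimal sketch of those conventions is shown below; the two helper functions are illustrative only and not part of the patch.

	#include <linux/completion.h>
	#include <linux/errno.h>
	#include <linux/jiffies.h>

	/* Old scheme: wait_for_completion_interruptible_timeout() returns
	 * > 0 (remaining jiffies) on completion, 0 on timeout, or
	 * -ERESTARTSYS if interrupted, so the caller needs three branches.
	 */
	static int wait_with_timeout(struct completion *done, unsigned int ms)
	{
		long ret = wait_for_completion_interruptible_timeout(done,
						msecs_to_jiffies(ms) + 1);
		if (ret < 0)
			return ret;		/* interrupted */
		if (ret == 0)
			return -ETIMEDOUT;	/* local timeout, as in the old code */
		return 0;			/* completed in time */
	}

	/* New scheme: wait_for_completion_interruptible() returns 0 on
	 * completion or -ERESTARTSYS if interrupted.  There is no timeout
	 * case, so a plain "if (ret)" suffices and the decision to give up
	 * is left to RDMA-CM, which reports an unreachable/error CM event
	 * after its own ~90 second timer expires.
	 */
	static int wait_without_timeout(struct completion *done)
	{
		return wait_for_completion_interruptible(done);
	}

Note that the constant is renamed rather than removed because rdma_resolve_addr() and rdma_resolve_route() still take an explicit timeout in milliseconds for address and route resolution; only the wait for the connect/established event loses its local timeout.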
-rw-r--r--  drivers/nvme/host/rdma.c  13
1 file changed, 5 insertions(+), 8 deletions(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 84ce3347d158..7d01fb770284 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -29,7 +29,7 @@
#include "fabrics.h"
-#define NVME_RDMA_CONNECT_TIMEOUT_MS 3000 /* 3 second */
+#define NVME_RDMA_CM_TIMEOUT_MS 3000 /* 3 second */
#define NVME_RDMA_MAX_SEGMENTS 256
@@ -248,12 +248,9 @@ static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
{
int ret;
- ret = wait_for_completion_interruptible_timeout(&queue->cm_done,
- msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
- if (ret < 0)
+ ret = wait_for_completion_interruptible(&queue->cm_done);
+ if (ret)
return ret;
- if (ret == 0)
- return -ETIMEDOUT;
WARN_ON_ONCE(queue->cm_error > 0);
return queue->cm_error;
}
@@ -612,7 +609,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
queue->cm_error = -ETIMEDOUT;
ret = rdma_resolve_addr(queue->cm_id, src_addr,
(struct sockaddr *)&ctrl->addr,
- NVME_RDMA_CONNECT_TIMEOUT_MS);
+ NVME_RDMA_CM_TIMEOUT_MS);
if (ret) {
dev_info(ctrl->ctrl.device,
"rdma_resolve_addr failed (%d).\n", ret);
@@ -1887,7 +1884,7 @@ static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
if (ctrl->opts->tos >= 0)
rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
- ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
+ ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CM_TIMEOUT_MS);
if (ret) {
dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
queue->cm_error);