RDMA/rtrs-clt: Do stop and failover outside reconnect work.
authorJack Wang <jinpu.wang@ionos.com>
Fri, 14 Jan 2022 15:47:53 +0000 (16:47 +0100)
committerJason Gunthorpe <jgg@nvidia.com>
Fri, 28 Jan 2022 14:58:17 +0000 (10:58 -0400)
We can't reconnect instantly, to avoid DDoSing the server, but we should
stop and failover earlier so there is less service interruption.

To avoid deadlock, since error recovery is invoked from different callbacks
such as the rdma event and heartbeat error handlers, add a new
err_recovery_work work item.

Link: https://lore.kernel.org/r/20220114154753.983568-6-haris.iqbal@ionos.com
Signed-off-by: Jack Wang <jinpu.wang@ionos.com>
Reviewed-by: Aleksei Marov <aleksei.marov@ionos.com>
Reviewed-by: Md Haris Iqbal <haris.iqbal@ionos.com>
Signed-off-by: Md Haris Iqbal <haris.iqbal@ionos.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
drivers/infiniband/ulp/rtrs/rtrs-clt.c
drivers/infiniband/ulp/rtrs/rtrs-clt.h

index a866162465862c20c1aa33c66d328c623a27e3dd..b696aa4abae46d230cbe284ece2d539fc9dd2680 100644 (file)
@@ -297,6 +297,7 @@ static bool rtrs_clt_change_state_from_to(struct rtrs_clt_path *clt_path,
        return changed;
 }
 
+static void rtrs_clt_stop_and_destroy_conns(struct rtrs_clt_path *clt_path);
 static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con)
 {
        struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
@@ -304,16 +305,7 @@ static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con)
        if (rtrs_clt_change_state_from_to(clt_path,
                                           RTRS_CLT_CONNECTED,
                                           RTRS_CLT_RECONNECTING)) {
-               struct rtrs_clt_sess *clt = clt_path->clt;
-               unsigned int delay_ms;
-
-               /*
-                * Normal scenario, reconnect if we were successfully connected
-                */
-               delay_ms = clt->reconnect_delay_sec * 1000;
-               queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
-                                  msecs_to_jiffies(delay_ms +
-                                                   prandom_u32() % RTRS_RECONNECT_SEED));
+               queue_work(rtrs_wq, &clt_path->err_recovery_work);
        } else {
                /*
                 * Error can happen just on establishing new connection,
@@ -1511,6 +1503,22 @@ static void rtrs_clt_init_hb(struct rtrs_clt_path *clt_path)
 static void rtrs_clt_reconnect_work(struct work_struct *work);
 static void rtrs_clt_close_work(struct work_struct *work);
 
+static void rtrs_clt_err_recovery_work(struct work_struct *work)
+{
+       struct rtrs_clt_path *clt_path;
+       struct rtrs_clt_sess *clt;
+       int delay_ms;
+
+       clt_path = container_of(work, struct rtrs_clt_path, err_recovery_work);
+       clt = clt_path->clt;
+       delay_ms = clt->reconnect_delay_sec * 1000;
+       rtrs_clt_stop_and_destroy_conns(clt_path);
+       queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
+                          msecs_to_jiffies(delay_ms +
+                                           prandom_u32() %
+                                           RTRS_RECONNECT_SEED));
+}
+
 static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt,
                                        const struct rtrs_addr *path,
                                        size_t con_num, u32 nr_poll_queues)
@@ -1562,6 +1570,7 @@ static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt,
        clt_path->state = RTRS_CLT_CONNECTING;
        atomic_set(&clt_path->connected_cnt, 0);
        INIT_WORK(&clt_path->close_work, rtrs_clt_close_work);
+       INIT_WORK(&clt_path->err_recovery_work, rtrs_clt_err_recovery_work);
        INIT_DELAYED_WORK(&clt_path->reconnect_dwork, rtrs_clt_reconnect_work);
        rtrs_clt_init_hb(clt_path);
 
@@ -2326,6 +2335,7 @@ static void rtrs_clt_close_work(struct work_struct *work)
 
        clt_path = container_of(work, struct rtrs_clt_path, close_work);
 
+       cancel_work_sync(&clt_path->err_recovery_work);
        cancel_delayed_work_sync(&clt_path->reconnect_dwork);
        rtrs_clt_stop_and_destroy_conns(clt_path);
        rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CLOSED, NULL);
@@ -2638,7 +2648,6 @@ static void rtrs_clt_reconnect_work(struct work_struct *work)
 {
        struct rtrs_clt_path *clt_path;
        struct rtrs_clt_sess *clt;
-       unsigned int delay_ms;
        int err;
 
        clt_path = container_of(to_delayed_work(work), struct rtrs_clt_path,
@@ -2655,8 +2664,6 @@ static void rtrs_clt_reconnect_work(struct work_struct *work)
        }
        clt_path->reconnect_attempts++;
 
-       /* Stop everything */
-       rtrs_clt_stop_and_destroy_conns(clt_path);
        msleep(RTRS_RECONNECT_BACKOFF);
        if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING, NULL)) {
                err = init_path(clt_path);
@@ -2669,11 +2676,7 @@ static void rtrs_clt_reconnect_work(struct work_struct *work)
 reconnect_again:
        if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_RECONNECTING, NULL)) {
                clt_path->stats->reconnects.fail_cnt++;
-               delay_ms = clt->reconnect_delay_sec * 1000;
-               queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
-                                  msecs_to_jiffies(delay_ms +
-                                                   prandom_u32() %
-                                                   RTRS_RECONNECT_SEED));
+               queue_work(rtrs_wq, &clt_path->err_recovery_work);
        }
 }
 
@@ -2905,6 +2908,7 @@ int rtrs_clt_reconnect_from_sysfs(struct rtrs_clt_path *clt_path)
                                                 &old_state);
        if (changed) {
                clt_path->reconnect_attempts = 0;
+               rtrs_clt_stop_and_destroy_conns(clt_path);
                queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork, 0);
        }
        if (changed || old_state == RTRS_CLT_RECONNECTING) {
index d1b18a154ae03b18f9a26b8b0ebd211373f10dc3..f848c0392d982a4a0e35e43ce7d211af4bca5034 100644 (file)
@@ -134,6 +134,7 @@ struct rtrs_clt_path {
        struct rtrs_clt_io_req  *reqs;
        struct delayed_work     reconnect_dwork;
        struct work_struct      close_work;
+       struct work_struct      err_recovery_work;
        unsigned int            reconnect_attempts;
        bool                    established;
        struct rtrs_rbuf        *rbufs;