client: make SEND_ETA timeout non-fatal
author Jens Axboe <axboe@fb.com>
Tue, 15 Dec 2015 01:51:22 +0000 (18:51 -0700)
committer Jens Axboe <axboe@fb.com>
Tue, 15 Dec 2015 01:51:22 +0000 (18:51 -0700)
If we fail a single ETA, then just soldier on. If we fail 5 in a row,
then give up permanently.

Also bring the command timeout back to 5s per command.

Signed-off-by: Jens Axboe <axboe@fb.com>
client.c
client.h
server.h

index 2cba8a034993a6aa55d351c5c3f8dfa7b5a8ef02..35cbf68cfa82860507ba12f616c27207a926f796 100644 (file)
--- a/client.c
+++ b/client.c
@@ -1159,6 +1159,7 @@ static void handle_eta(struct fio_client *client, struct fio_net_cmd *cmd)
 
        client->eta_in_flight = NULL;
        flist_del_init(&client->eta_list);
+       client->eta_timeouts = 0;
 
        if (client->ops->jobs_eta)
                client->ops->jobs_eta(client, je);
@@ -1588,6 +1589,34 @@ static void request_client_etas(struct client_ops *ops)
        dprint(FD_NET, "client: requested eta tag %p\n", eta);
 }
 
+/*
+ * A single SEND_ETA timeout isn't fatal. Returns 0 if recovered (reply is
+ * unlinked and freed here), 1 if the caller must fail it (reply untouched).
+ */
+static int handle_cmd_timeout(struct fio_client *client,
+                             struct fio_net_cmd_reply *reply)
+{
+       if (reply->opcode != FIO_NET_CMD_SEND_ETA)
+               return 1;
+
+       log_info("client <%s>: timeout on SEND_ETA\n", client->hostname);
+
+       flist_del_init(&client->eta_list);
+       if (client->eta_in_flight) {
+               fio_client_dec_jobs_eta(client->eta_in_flight, client->ops->eta);
+               client->eta_in_flight = NULL;
+       }
+
+       /* 5 in a row: give up. Caller still reads and unlinks reply, keep it. */
+       if (++client->eta_timeouts >= 5)
+               return 1;
+
+       /* Recovered: the caller skips this reply, so release it here. */
+       flist_del(&reply->list);
+       free(reply);
+       return 0;
+}
+
 static int client_check_cmd_timeout(struct fio_client *client,
                                    struct timeval *now)
 {
@@ -1601,6 +1630,9 @@ static int client_check_cmd_timeout(struct fio_client *client,
                if (mtime_since(&reply->tv, now) < FIO_NET_CLIENT_TIMEOUT)
                        continue;
 
+               if (!handle_cmd_timeout(client, reply))
+                       continue;
+
                log_err("fio: client %s, timeout on cmd %s\n", client->hostname,
                                                fio_server_op(reply->opcode));
                flist_del(&reply->list);
index cfb0b4d4793bef27e5f2086f1b9c1219852aceee..46e30edce3c5f23167dadf6b3e9be4827aba2984 100644 (file)
--- a/client.h
+++ b/client.h
@@ -60,6 +60,7 @@ struct fio_client {
 
        struct flist_head eta_list;
        struct client_eta *eta_in_flight;
+       unsigned int eta_timeouts;
 
        struct flist_head cmd_list;
 
index 6747bf0241e1b6c523a912ef9c57ac21487f3043..d73ce1d635466f0111fe2e7973dc6026fd8240b5 100644 (file)
--- a/server.h
+++ b/server.h
@@ -74,7 +74,7 @@ enum {
 
        FIO_NET_NAME_MAX                = 256,
 
-       FIO_NET_CLIENT_TIMEOUT          = 30000,
+       FIO_NET_CLIENT_TIMEOUT          = 5000,
 
        FIO_PROBE_FLAG_ZLIB             = 1UL << 0,
 };