rxrpc: Implement RACK/TLP to deal with transmission stalls [RFC8985]
author David Howells <dhowells@redhat.com>
Wed, 4 Dec 2024 07:47:07 +0000 (07:47 +0000)
committer Jakub Kicinski <kuba@kernel.org>
Mon, 9 Dec 2024 21:48:33 +0000 (13:48 -0800)
When an rxrpc call is in its transmission phase and is sending a lot of
packets, stalls occasionally occur that cause severe performance
degradation (eg. increasing the transmission time for a 256MiB payload from
0.7s to 2.5s over a 10G link).

rxrpc already implements TCP-style congestion control [RFC5681] and this
helps mitigate the effects, but occasionally the timeout event that would
deal with a lost ACK goes missing, leaving the call stalled until the RTO
expires.

Fix this by implementing RACK/TLP in rxrpc.
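
In outline, RACK treats a packet as lost once a packet transmitted after
it has been ACK'd and a short reordering window has elapsed, whilst TLP
arms a probe timeout (PTO) of roughly 2*SRTT so that a tail loss
provokes ACK feedback well before the RTO would fire.  A minimal sketch
of those two rules, using illustrative types and names rather than the
ones this patch adds:

	#include <stdbool.h>
	#include <stdint.h>

	struct seg { int64_t xmit_ts; uint32_t seq; };

	/* RFC8985 "sent after": later timestamp, ties broken by seq. */
	static bool sent_after(const struct seg *a, const struct seg *b)
	{
		return a->xmit_ts > b->xmit_ts ||
		       (a->xmit_ts == b->xmit_ts && a->seq > b->seq);
	}

	/* RACK loss detection [RFC8985 6.2]: seg is lost if something
	 * sent after it has been ACK'd (rack = most recently delivered
	 * segment) and the reordering window has since passed.
	 */
	static bool rack_is_lost(const struct seg *rack, const struct seg *seg,
				 int64_t now, int64_t rtt, int64_t reo_wnd)
	{
		return sent_after(rack, seg) &&
		       now > seg->xmit_ts + rtt + reo_wnd;
	}

	/* TLP [RFC8985 7.2]: probe after ~2*SRTT, capped by the RTO. */
	static int64_t tlp_pto(int64_t srtt, int64_t max_ack_delay, int64_t rto)
	{
		int64_t pto = 2 * srtt + max_ack_delay;
		return pto < rto ? pto : rto;
	}

The call's single transmission timer then only ever serves one of these
purposes at a time: RACK reordering window, TLP probe timeout or RTO.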

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
include/trace/events/rxrpc.h
net/rxrpc/Makefile
net/rxrpc/ar-internal.h
net/rxrpc/call_event.c
net/rxrpc/call_object.c
net/rxrpc/input.c
net/rxrpc/input_rack.c [new file with mode: 0644]
net/rxrpc/io_thread.c
net/rxrpc/output.c

index 71df5c48a413d0978c4a96cc02cc7100f4c31c5d..2f119d18a061fe6706456e629ccfdf5282bf98be 100644
 #define rxrpc_txdata_traces \
        EM(rxrpc_txdata_inject_loss,            " *INJ-LOSS*") \
        EM(rxrpc_txdata_new_data,               " ") \
-       E_(rxrpc_txdata_retransmit,             " *RETRANS*")
+       EM(rxrpc_txdata_retransmit,             " *RETRANS*") \
+       EM(rxrpc_txdata_tlp_new_data,           " *TLP-NEW*") \
+       E_(rxrpc_txdata_tlp_retransmit,         " *TLP-RETRANS*")
 
 #define rxrpc_receive_traces \
        EM(rxrpc_receive_end,                   "END") \
        EM(rxrpc_timer_trace_hard,              "HardLimit") \
        EM(rxrpc_timer_trace_idle,              "IdleLimit") \
        EM(rxrpc_timer_trace_keepalive,         "KeepAlive") \
-       EM(rxrpc_timer_trace_lost_ack,          "LostAck  ") \
        EM(rxrpc_timer_trace_ping,              "DelayPing") \
-       EM(rxrpc_timer_trace_resend,            "Resend   ") \
-       EM(rxrpc_timer_trace_resend_reset,      "ResendRst") \
-       E_(rxrpc_timer_trace_resend_tx,         "ResendTx ")
+       EM(rxrpc_timer_trace_rack_off,          "RACK-OFF ") \
+       EM(rxrpc_timer_trace_rack_zwp,          "RACK-ZWP ") \
+       EM(rxrpc_timer_trace_rack_reo,          "RACK-Reo ") \
+       EM(rxrpc_timer_trace_rack_tlp_pto,      "TLP-PTO  ") \
+       E_(rxrpc_timer_trace_rack_rto,          "RTO      ")
 
 #define rxrpc_propose_ack_traces \
        EM(rxrpc_propose_ack_client_tx_end,     "ClTxEnd") \
        EM(rxrpc_txbuf_put_rotated,             "PUT ROTATED")  \
        EM(rxrpc_txbuf_put_send_aborted,        "PUT SEND-X ")  \
        EM(rxrpc_txbuf_put_trans,               "PUT TRANS  ")  \
+       EM(rxrpc_txbuf_see_lost,                "SEE LOST   ")  \
        EM(rxrpc_txbuf_see_out_of_step,         "OUT-OF-STEP")  \
-       EM(rxrpc_txbuf_see_send_more,           "SEE SEND+  ")  \
-       E_(rxrpc_txbuf_see_unacked,             "SEE UNACKED")
+       E_(rxrpc_txbuf_see_send_more,           "SEE SEND+  ")
 
 #define rxrpc_tq_traces \
        EM(rxrpc_tq_alloc,                      "ALLOC") \
        EM(rxrpc_rotate_trace_sack,             "soft-ack")     \
        E_(rxrpc_rotate_trace_snak,             "soft-nack")
 
+#define rxrpc_rack_timer_modes \
+       EM(RXRPC_CALL_RACKTIMER_OFF,            "---") \
+       EM(RXRPC_CALL_RACKTIMER_RACK_REORDER,   "REO") \
+       EM(RXRPC_CALL_RACKTIMER_TLP_PTO,        "TLP") \
+       E_(RXRPC_CALL_RACKTIMER_RTO,            "RTO")
+
+#define rxrpc_tlp_probe_traces \
+       EM(rxrpc_tlp_probe_trace_busy,          "busy")         \
+       EM(rxrpc_tlp_probe_trace_transmit_new,  "transmit-new") \
+       E_(rxrpc_tlp_probe_trace_retransmit,    "retransmit")
+
+#define rxrpc_tlp_ack_traces \
+       EM(rxrpc_tlp_ack_trace_acked,           "acked")        \
+       EM(rxrpc_tlp_ack_trace_dup_acked,       "dup-acked")    \
+       EM(rxrpc_tlp_ack_trace_hard_beyond,     "hard-beyond")  \
+       EM(rxrpc_tlp_ack_trace_incomplete,      "incomplete")   \
+       E_(rxrpc_tlp_ack_trace_new_data,        "new-data")
+
 /*
  * Generate enums for tracing information.
  */
@@ -537,6 +558,8 @@ enum rxrpc_rtt_tx_trace             { rxrpc_rtt_tx_traces } __mode(byte);
 enum rxrpc_sack_trace          { rxrpc_sack_traces } __mode(byte);
 enum rxrpc_skb_trace           { rxrpc_skb_traces } __mode(byte);
 enum rxrpc_timer_trace         { rxrpc_timer_traces } __mode(byte);
+enum rxrpc_tlp_ack_trace       { rxrpc_tlp_ack_traces } __mode(byte);
+enum rxrpc_tlp_probe_trace     { rxrpc_tlp_probe_traces } __mode(byte);
 enum rxrpc_tq_trace            { rxrpc_tq_traces } __mode(byte);
 enum rxrpc_tx_point            { rxrpc_tx_points } __mode(byte);
 enum rxrpc_txbuf_trace         { rxrpc_txbuf_traces } __mode(byte);
@@ -567,6 +590,7 @@ rxrpc_conn_traces;
 rxrpc_local_traces;
 rxrpc_pmtud_reduce_traces;
 rxrpc_propose_ack_traces;
+rxrpc_rack_timer_modes;
 rxrpc_receive_traces;
 rxrpc_recvmsg_traces;
 rxrpc_req_ack_traces;
@@ -576,6 +600,8 @@ rxrpc_rtt_tx_traces;
 rxrpc_sack_traces;
 rxrpc_skb_traces;
 rxrpc_timer_traces;
+rxrpc_tlp_ack_traces;
+rxrpc_tlp_probe_traces;
 rxrpc_tq_traces;
 rxrpc_tx_points;
 rxrpc_txbuf_traces;
@@ -618,6 +644,20 @@ TRACE_EVENT(rxrpc_local,
                      __entry->usage)
            );
 
+TRACE_EVENT(rxrpc_iothread_rx,
+           TP_PROTO(struct rxrpc_local *local, unsigned int nr_rx),
+           TP_ARGS(local, nr_rx),
+           TP_STRUCT__entry(
+                   __field(unsigned int,       local)
+                   __field(unsigned int,       nr_rx)
+                            ),
+           TP_fast_assign(
+                   __entry->local = local->debug_id;
+                   __entry->nr_rx = nr_rx;
+                          ),
+           TP_printk("L=%08x nrx=%u", __entry->local, __entry->nr_rx)
+           );
+
 TRACE_EVENT(rxrpc_peer,
            TP_PROTO(unsigned int peer_debug_id, int ref, enum rxrpc_peer_trace why),
 
@@ -1684,16 +1724,15 @@ TRACE_EVENT(rxrpc_drop_ack,
 TRACE_EVENT(rxrpc_retransmit,
            TP_PROTO(struct rxrpc_call *call,
                     struct rxrpc_send_data_req *req,
-                    struct rxrpc_txbuf *txb, ktime_t expiry),
+                    struct rxrpc_txbuf *txb),
 
-           TP_ARGS(call, req, txb, expiry),
+           TP_ARGS(call, req, txb),
 
            TP_STRUCT__entry(
                    __field(unsigned int,       call)
                    __field(unsigned int,       qbase)
                    __field(rxrpc_seq_t,        seq)
                    __field(rxrpc_serial_t,     serial)
-                   __field(ktime_t,            expiry)
                             ),
 
            TP_fast_assign(
@@ -1701,15 +1740,13 @@ TRACE_EVENT(rxrpc_retransmit,
                    __entry->qbase = req->tq->qbase;
                    __entry->seq = req->seq;
                    __entry->serial = txb->serial;
-                   __entry->expiry = expiry;
                           ),
 
-           TP_printk("c=%08x tq=%x q=%x r=%x xp=%lld",
+           TP_printk("c=%08x tq=%x q=%x r=%x",
                      __entry->call,
                      __entry->qbase,
                      __entry->seq,
-                     __entry->serial,
-                     ktime_to_us(__entry->expiry))
+                     __entry->serial)
            );
 
 TRACE_EVENT(rxrpc_congest,
@@ -1767,9 +1804,9 @@ TRACE_EVENT(rxrpc_congest,
            );
 
 TRACE_EVENT(rxrpc_reset_cwnd,
-           TP_PROTO(struct rxrpc_call *call, ktime_t now),
+           TP_PROTO(struct rxrpc_call *call, ktime_t since_last_tx, ktime_t rtt),
 
-           TP_ARGS(call, now),
+           TP_ARGS(call, since_last_tx, rtt),
 
            TP_STRUCT__entry(
                    __field(unsigned int,               call)
@@ -1779,6 +1816,7 @@ TRACE_EVENT(rxrpc_reset_cwnd,
                    __field(rxrpc_seq_t,                hard_ack)
                    __field(rxrpc_seq_t,                prepared)
                    __field(ktime_t,                    since_last_tx)
+                   __field(ktime_t,                    rtt)
                    __field(bool,                       has_data)
                             ),
 
@@ -1789,18 +1827,20 @@ TRACE_EVENT(rxrpc_reset_cwnd,
                    __entry->extra      = call->cong_extra;
                    __entry->hard_ack   = call->acks_hard_ack;
                    __entry->prepared   = call->send_top - call->tx_bottom;
-                   __entry->since_last_tx = ktime_sub(now, call->tx_last_sent);
+                   __entry->since_last_tx = since_last_tx;
+                   __entry->rtt        = rtt;
                    __entry->has_data   = call->tx_bottom != call->tx_top;
                           ),
 
-           TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu d=%u",
+           TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu/%llu d=%u",
                      __entry->call,
                      __entry->hard_ack,
                      __print_symbolic(__entry->ca_state, rxrpc_ca_states),
                      __entry->cwnd,
                      __entry->extra,
                      __entry->prepared,
-                     ktime_to_ns(__entry->since_last_tx),
+                     ktime_to_us(__entry->since_last_tx),
+                     ktime_to_us(__entry->rtt),
                      __entry->has_data)
            );
 
@@ -1925,6 +1965,32 @@ TRACE_EVENT(rxrpc_resend,
                      __entry->transmitted)
            );
 
+TRACE_EVENT(rxrpc_resend_lost,
+           TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, unsigned long lost),
+
+           TP_ARGS(call, tq, lost),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,       call)
+                   __field(rxrpc_seq_t,        qbase)
+                   __field(u8,                 nr_rep)
+                   __field(unsigned long,      lost)
+                            ),
+
+           TP_fast_assign(
+                   __entry->call = call->debug_id;
+                   __entry->qbase = tq->qbase;
+                   __entry->nr_rep = tq->nr_reported_acks;
+                   __entry->lost = lost;
+                          ),
+
+           TP_printk("c=%08x tq=%x lost=%016lx nr=%u",
+                     __entry->call,
+                     __entry->qbase,
+                     __entry->lost,
+                     __entry->nr_rep)
+           );
+
 TRACE_EVENT(rxrpc_rotate,
            TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq,
                     struct rxrpc_ack_summary *summary, rxrpc_seq_t seq,
@@ -2363,6 +2429,244 @@ TRACE_EVENT(rxrpc_pmtud_reduce,
                      __entry->serial, __entry->max_data)
            );
 
+TRACE_EVENT(rxrpc_rack,
+           TP_PROTO(struct rxrpc_call *call, ktime_t timo),
+
+           TP_ARGS(call, timo),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,       call)
+                   __field(rxrpc_serial_t,     ack_serial)
+                   __field(rxrpc_seq_t,        seq)
+                   __field(enum rxrpc_rack_timer_mode, mode)
+                   __field(unsigned short,     nr_sent)
+                   __field(unsigned short,     nr_lost)
+                   __field(unsigned short,     nr_resent)
+                   __field(unsigned short,     nr_sacked)
+                   __field(ktime_t,            timo)
+                            ),
+
+           TP_fast_assign(
+                   __entry->call       = call->debug_id;
+                   __entry->ack_serial = call->rx_serial;
+                   __entry->seq        = call->rack_end_seq;
+                   __entry->mode       = call->rack_timer_mode;
+                   __entry->nr_sent    = call->tx_nr_sent;
+                   __entry->nr_lost    = call->tx_nr_lost;
+                   __entry->nr_resent  = call->tx_nr_resent;
+                   __entry->nr_sacked  = call->acks_nr_sacks;
+                   __entry->timo       = timo;
+                          ),
+
+           TP_printk("c=%08x r=%08x q=%08x %s slrs=%u,%u,%u,%u t=%lld",
+                     __entry->call, __entry->ack_serial, __entry->seq,
+                     __print_symbolic(__entry->mode, rxrpc_rack_timer_modes),
+                     __entry->nr_sent, __entry->nr_lost,
+                     __entry->nr_resent, __entry->nr_sacked,
+                     ktime_to_us(__entry->timo))
+           );
+
+TRACE_EVENT(rxrpc_rack_update,
+           TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary),
+
+           TP_ARGS(call, summary),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,       call)
+                   __field(rxrpc_serial_t,     ack_serial)
+                   __field(rxrpc_seq_t,        seq)
+                   __field(int,                xmit_ts)
+                            ),
+
+           TP_fast_assign(
+                   __entry->call       = call->debug_id;
+                   __entry->ack_serial = call->rx_serial;
+                   __entry->seq        = call->rack_end_seq;
+                   __entry->xmit_ts    = ktime_sub(call->acks_latest_ts, call->rack_xmit_ts);
+                          ),
+
+           TP_printk("c=%08x r=%08x q=%08x xt=%lld",
+                     __entry->call, __entry->ack_serial, __entry->seq,
+                     ktime_to_us(__entry->xmit_ts))
+           );
+
+TRACE_EVENT(rxrpc_rack_scan_loss,
+           TP_PROTO(struct rxrpc_call *call),
+
+           TP_ARGS(call),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,       call)
+                   __field(ktime_t,            rack_rtt)
+                   __field(ktime_t,            rack_reo_wnd)
+                            ),
+
+           TP_fast_assign(
+                   __entry->call               = call->debug_id;
+                   __entry->rack_rtt           = call->rack_rtt;
+                   __entry->rack_reo_wnd       = call->rack_reo_wnd;
+                          ),
+
+           TP_printk("c=%08x rtt=%lld reow=%lld",
+                     __entry->call, ktime_to_us(__entry->rack_rtt),
+                     ktime_to_us(__entry->rack_reo_wnd))
+           );
+
+TRACE_EVENT(rxrpc_rack_scan_loss_tq,
+           TP_PROTO(struct rxrpc_call *call, const struct rxrpc_txqueue *tq,
+                    unsigned long nacks),
+
+           TP_ARGS(call, tq, nacks),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,       call)
+                   __field(rxrpc_seq_t,        qbase)
+                   __field(unsigned long,      nacks)
+                   __field(unsigned long,      lost)
+                   __field(unsigned long,      retrans)
+                            ),
+
+           TP_fast_assign(
+                   __entry->call       = call->debug_id;
+                   __entry->qbase      = tq->qbase;
+                   __entry->nacks      = nacks;
+                   __entry->lost       = tq->segment_lost;
+                   __entry->retrans    = tq->segment_retransmitted;
+                          ),
+
+           TP_printk("c=%08x q=%08x n=%lx l=%lx r=%lx",
+                     __entry->call, __entry->qbase,
+                     __entry->nacks, __entry->lost, __entry->retrans)
+           );
+
+TRACE_EVENT(rxrpc_rack_detect_loss,
+           TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary,
+                    rxrpc_seq_t seq),
+
+           TP_ARGS(call, summary, seq),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,       call)
+                   __field(rxrpc_serial_t,     ack_serial)
+                   __field(rxrpc_seq_t,        seq)
+                            ),
+
+           TP_fast_assign(
+                   __entry->call       = call->debug_id;
+                   __entry->ack_serial = call->rx_serial;
+                   __entry->seq        = seq;
+                          ),
+
+           TP_printk("c=%08x r=%08x q=%08x",
+                     __entry->call, __entry->ack_serial, __entry->seq)
+           );
+
+TRACE_EVENT(rxrpc_rack_mark_loss_tq,
+           TP_PROTO(struct rxrpc_call *call, const struct rxrpc_txqueue *tq),
+
+           TP_ARGS(call, tq),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,       call)
+                   __field(rxrpc_seq_t,        qbase)
+                   __field(rxrpc_seq_t,        trans)
+                   __field(unsigned long,      acked)
+                   __field(unsigned long,      lost)
+                   __field(unsigned long,      retrans)
+                            ),
+
+           TP_fast_assign(
+                   __entry->call       = call->debug_id;
+                   __entry->qbase      = tq->qbase;
+                   __entry->trans      = call->tx_transmitted;
+                   __entry->acked      = tq->segment_acked;
+                   __entry->lost       = tq->segment_lost;
+                   __entry->retrans    = tq->segment_retransmitted;
+                          ),
+
+           TP_printk("c=%08x tq=%08x txq=%08x a=%lx l=%lx r=%lx",
+                     __entry->call, __entry->qbase, __entry->trans,
+                     __entry->acked, __entry->lost, __entry->retrans)
+           );
+
+TRACE_EVENT(rxrpc_tlp_probe,
+           TP_PROTO(struct rxrpc_call *call, enum rxrpc_tlp_probe_trace trace),
+
+           TP_ARGS(call, trace),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,               call)
+                   __field(rxrpc_serial_t,             serial)
+                   __field(rxrpc_seq_t,                seq)
+                   __field(enum rxrpc_tlp_probe_trace, trace)
+                            ),
+
+           TP_fast_assign(
+                   __entry->call       = call->debug_id;
+                   __entry->serial     = call->tlp_serial;
+                   __entry->seq        = call->tlp_seq;
+                   __entry->trace      = trace;
+                          ),
+
+           TP_printk("c=%08x r=%08x pq=%08x %s",
+                     __entry->call, __entry->serial, __entry->seq,
+                     __print_symbolic(__entry->trace, rxrpc_tlp_probe_traces))
+           );
+
+TRACE_EVENT(rxrpc_tlp_ack,
+           TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary,
+                    enum rxrpc_tlp_ack_trace trace),
+
+           TP_ARGS(call, summary, trace),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,               call)
+                   __field(rxrpc_serial_t,             serial)
+                   __field(rxrpc_seq_t,                tlp_seq)
+                   __field(rxrpc_seq_t,                hard_ack)
+                   __field(enum rxrpc_tlp_ack_trace,   trace)
+                            ),
+
+           TP_fast_assign(
+                   __entry->call       = call->debug_id;
+                   __entry->serial     = call->tlp_serial;
+                   __entry->tlp_seq    = call->tlp_seq;
+                   __entry->hard_ack   = call->acks_hard_ack;
+                   __entry->trace      = trace;
+                          ),
+
+           TP_printk("c=%08x r=%08x pq=%08x hq=%08x %s",
+                     __entry->call, __entry->serial,
+                     __entry->tlp_seq, __entry->hard_ack,
+                     __print_symbolic(__entry->trace, rxrpc_tlp_ack_traces))
+           );
+
+TRACE_EVENT(rxrpc_rack_timer,
+           TP_PROTO(struct rxrpc_call *call, ktime_t delay, bool exp),
+
+           TP_ARGS(call, delay, exp),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int,               call)
+                   __field(bool,                       exp)
+                   __field(enum rxrpc_rack_timer_mode, mode)
+                   __field(ktime_t,                    delay)
+                            ),
+
+           TP_fast_assign(
+                   __entry->call               = call->debug_id;
+                   __entry->exp                = exp;
+                   __entry->mode               = call->rack_timer_mode;
+                   __entry->delay              = delay;
+                          ),
+
+           TP_printk("c=%08x %s %s to=%lld",
+                     __entry->call,
+                     __entry->exp ? "Exp" : "Set",
+                     __print_symbolic(__entry->mode, rxrpc_rack_timer_modes),
+                     ktime_to_us(__entry->delay))
+           );
+
 #undef EM
 #undef E_
 
index ac5caf5a48e1616ab456df7a124a663156bac660..210b75e3179e9874737fb4cfd5f8cada45afb719 100644
@@ -16,6 +16,7 @@ rxrpc-y := \
        conn_object.o \
        conn_service.o \
        input.o \
+       input_rack.o \
        insecure.o \
        io_thread.o \
        key.o \
index a9d732ba6df0d182e21a4c7258ddffac9783a367..0c0a3c89dba3258293a4aba2e11ce606882c6f34 100644
@@ -621,6 +621,18 @@ enum rxrpc_ca_state {
        NR__RXRPC_CA_STATES
 } __mode(byte);
 
+/*
+ * Current purpose of call RACK timer.  According to the RACK-TLP protocol
+ * [RFC8985], the transmission timer (call->rack_timo_at) may only be used for
+ * one of these at once.
+ */
+enum rxrpc_rack_timer_mode {
+       RXRPC_CALL_RACKTIMER_OFF,               /* Timer not running */
+       RXRPC_CALL_RACKTIMER_RACK_REORDER,      /* RACK reordering timer */
+       RXRPC_CALL_RACKTIMER_TLP_PTO,           /* TLP timeout */
+       RXRPC_CALL_RACKTIMER_RTO,               /* Retransmission timeout */
+} __mode(byte);
+
 /*
  * RxRPC call definition
  * - matched by { connection, call_id }
@@ -638,8 +650,7 @@ struct rxrpc_call {
        struct mutex            user_mutex;     /* User access mutex */
        struct sockaddr_rxrpc   dest_srx;       /* Destination address */
        ktime_t                 delay_ack_at;   /* When DELAY ACK needs to happen */
-       ktime_t                 ack_lost_at;    /* When ACK is figured as lost */
-       ktime_t                 resend_at;      /* When next resend needs to happen */
+       ktime_t                 rack_timo_at;   /* When the RACK/TLP timer expires */
        ktime_t                 ping_at;        /* When next to send a ping */
        ktime_t                 keepalive_at;   /* When next to send a keepalive ping */
        ktime_t                 expect_rx_by;   /* When we expect to get a packet by */
@@ -695,8 +706,12 @@ struct rxrpc_call {
        rxrpc_seq_t             tx_bottom;      /* First packet in buffer */
        rxrpc_seq_t             tx_transmitted; /* Highest packet transmitted */
        rxrpc_seq_t             tx_top;         /* Highest Tx slot allocated. */
+       rxrpc_serial_t          tx_last_serial; /* Serial of last DATA transmitted */
        u16                     tx_backoff;     /* Delay to insert due to Tx failure (ms) */
-       u8                      tx_winsize;     /* Maximum size of Tx window */
+       u16                     tx_nr_sent;     /* Number of packets sent, but unacked */
+       u16                     tx_nr_lost;     /* Number of packets marked lost */
+       u16                     tx_nr_resent;   /* Number of packets resent, but unacked */
+       u16                     tx_winsize;     /* Maximum size of Tx window */
 #define RXRPC_TX_MAX_WINDOW    128
        u8                      tx_jumbo_max;   /* Maximum subpkts peer will accept */
        ktime_t                 tx_last_sent;   /* Last time a transmission occurred */
@@ -725,6 +740,25 @@ struct rxrpc_call {
        u16                     cong_cumul_acks; /* Cumulative ACK count */
        ktime_t                 cong_tstamp;    /* Last time cwnd was changed */
 
+       /* RACK-TLP [RFC8985] state. */
+       ktime_t                 rack_xmit_ts;   /* Latest transmission timestamp */
+       ktime_t                 rack_rtt;       /* RTT of most recently ACK'd segment */
+       ktime_t                 rack_rtt_ts;    /* Timestamp of rack_rtt */
+       ktime_t                 rack_reo_wnd;   /* Reordering window */
+       unsigned int            rack_reo_wnd_mult; /* Multiplier applied to rack_reo_wnd */
+       int                     rack_reo_wnd_persist; /* Num loss recoveries before reset reo_wnd */
+       rxrpc_seq_t             rack_fack;      /* Highest sequence so far ACK'd */
+       rxrpc_seq_t             rack_end_seq;   /* Highest sequence seen */
+       rxrpc_seq_t             rack_dsack_round; /* DSACK opt recv'd in latest roundtrip */
+       bool                    rack_dsack_round_none; /* T if dsack_round is "None" */
+       bool                    rack_reordering_seen; /* T if detected reordering event */
+       enum rxrpc_rack_timer_mode rack_timer_mode; /* Current mode of RACK timer */
+       bool                    tlp_is_retrans; /* T if unacked TLP retransmission */
+       rxrpc_serial_t          tlp_serial;     /* Serial of TLP probe (or 0 if none in progress) */
+       rxrpc_seq_t             tlp_seq;        /* Sequence of TLP probe */
+       unsigned int            tlp_rtt_taken;  /* Last time RTT taken */
+       ktime_t                 tlp_max_ack_delay; /* Sender budget for max delayed ACK interval */
+
        /* Receive-phase ACK management (ACKs we send). */
        u8                      ackr_reason;    /* reason to ACK */
        u16                     ackr_sack_base; /* Starting slot in SACK table ring */
@@ -783,6 +817,9 @@ struct rxrpc_ack_summary {
        bool            retrans_timeo:1;        /* T if reTx due to timeout happened */
        bool            need_retransmit:1;      /* T if we need transmission */
        bool            rtt_sample_avail:1;     /* T if RTT sample available */
+       bool            in_fast_or_rto_recovery:1;
+       bool            exiting_fast_or_rto_recovery:1;
+       bool            tlp_probe_acked:1;      /* T if the TLP probe seq was acked */
        u8 /*enum rxrpc_congest_change*/ change;
 };
 
@@ -864,6 +901,7 @@ struct rxrpc_txqueue {
        unsigned long           segment_lost;   /* Bit-per-buf: Set if declared lost */
        unsigned long           segment_retransmitted; /* Bit-per-buf: Set if retransmitted */
        unsigned long           rtt_samples;    /* Bit-per-buf: Set if available for RTT */
+       unsigned long           ever_retransmitted; /* Bit-per-buf: Set if ever retransmitted */
 
        /* The arrays we want to pack into as few cache lines as possible. */
        struct {
@@ -883,7 +921,9 @@ struct rxrpc_send_data_req {
        struct rxrpc_txqueue    *tq;            /* Tx queue segment holding first DATA */
        rxrpc_seq_t             seq;            /* Sequence of first data */
        int                     n;              /* Number of DATA packets to glue into jumbo */
+       bool                    retrans;        /* T if this is a retransmission */
        bool                    did_send;       /* T if did actually send */
+       bool                    tlp_probe;      /* T if this is a TLP probe */
        int /* enum rxrpc_txdata_trace */ trace;
 };
 
@@ -943,8 +983,9 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial,
                        enum rxrpc_propose_ack_trace why);
 void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t,
                             enum rxrpc_propose_ack_trace);
-void rxrpc_resend(struct rxrpc_call *call, rxrpc_serial_t ack_serial, bool ping_response);
-
+void rxrpc_resend_tlp(struct rxrpc_call *call);
+void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit,
+                             enum rxrpc_txdata_trace trace);
 bool rxrpc_input_call_event(struct rxrpc_call *call);
 
 /*
@@ -1123,6 +1164,32 @@ void rxrpc_congestion_degrade(struct rxrpc_call *);
 void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *);
 void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *);
 
+/*
+ * input_rack.c
+ */
+void rxrpc_input_rack_one(struct rxrpc_call *call,
+                         struct rxrpc_ack_summary *summary,
+                         struct rxrpc_txqueue *tq,
+                         unsigned int ix);
+void rxrpc_input_rack(struct rxrpc_call *call,
+                     struct rxrpc_ack_summary *summary,
+                     struct rxrpc_txqueue *tq,
+                     unsigned long new_acks);
+void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call,
+                                         struct rxrpc_ack_summary *summary);
+ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now);
+void rxrpc_tlp_send_probe(struct rxrpc_call *call);
+void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary);
+void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by);
+
+/* Initialise TLP state [RFC8985 7.1]. */
+static inline void rxrpc_tlp_init(struct rxrpc_call *call)
+{
+       call->tlp_serial = 0;
+       call->tlp_seq = call->acks_hard_ack;
+       call->tlp_is_retrans = false;
+}
+
 /*
  * io_thread.c
  */
@@ -1402,6 +1469,11 @@ static inline u32 latest(u32 seq1, u32 seq2)
        return after(seq1, seq2) ? seq1 : seq2;
 }
 
+/* Test whether a sequence number falls within a transmission queue segment. */
+static inline bool rxrpc_seq_in_txq(const struct rxrpc_txqueue *tq, rxrpc_seq_t seq)
+{
+       return (seq & ~(RXRPC_NR_TXQUEUE - 1)) == tq->qbase;
+}
+
 static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk_buff *skb)
 {
        rxrpc_get_skb(skb, rxrpc_skb_get_call_rx);
@@ -1409,6 +1481,31 @@ static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk
        rxrpc_poke_call(call, rxrpc_call_poke_rx_packet);
 }
 
+/*
+ * Calculate how much space there is for transmitting more DATA packets.
+ */
+static inline unsigned int rxrpc_tx_window_space(const struct rxrpc_call *call)
+{
+       int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra);
+       int transmitted = call->tx_top - call->tx_bottom;
+
+       return max(winsize - transmitted, 0);
+}
+
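+/* Number of packets no longer considered to be in the network: those
+ * soft-ACK'd plus those marked lost (the "left out" count, cf. RFC6675).
+ */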
+static inline unsigned int rxrpc_left_out(const struct rxrpc_call *call)
+{
+       return call->acks_nr_sacks + call->tx_nr_lost;
+}
+
+/*
+ * Calculate the number of transmitted DATA packets assumed to be in flight
+ * [approx RFC6675].
+ */
+static inline unsigned int rxrpc_tx_in_flight(const struct rxrpc_call *call)
+{
+       return call->tx_nr_sent - rxrpc_left_out(call) + call->tx_nr_resent;
+}
+
 /*
  * debug tracing
  */
index 7af2755442512227703386cddbe76ab723d880dc..8e477f7f885014891363343f4fe153bf896e7579 100644
@@ -54,35 +54,21 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial,
        trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_delayed_ack);
 }
 
-/*
- * Handle congestion being detected by the retransmit timeout.
- */
-static void rxrpc_congestion_timeout(struct rxrpc_call *call)
-{
-       set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags);
-}
-
 /*
  * Retransmit one or more packets.
  */
 static bool rxrpc_retransmit_data(struct rxrpc_call *call,
-                                 struct rxrpc_send_data_req *req,
-                                 ktime_t rto, bool skip_too_young)
+                                 struct rxrpc_send_data_req *req)
 {
        struct rxrpc_txqueue *tq = req->tq;
        unsigned int ix = req->seq & RXRPC_TXQ_MASK;
        struct rxrpc_txbuf *txb = tq->bufs[ix];
-       ktime_t xmit_ts, resend_at;
 
        _enter("%x,%x,%x,%x", tq->qbase, req->seq, ix, txb->debug_id);
 
-       xmit_ts = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]);
-       resend_at = ktime_add(xmit_ts, rto);
-       trace_rxrpc_retransmit(call, req, txb, ktime_sub(resend_at, req->now));
-       if (skip_too_young && ktime_after(resend_at, req->now))
-               return false;
+       req->retrans = true;
+       trace_rxrpc_retransmit(call, req, txb);
 
-       __set_bit(ix, &tq->segment_retransmitted);
        txb->flags |= RXRPC_TXBUF_RESENT;
        rxrpc_send_data_packet(call, req);
        rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans);
@@ -97,133 +83,76 @@ static bool rxrpc_retransmit_data(struct rxrpc_call *call,
 /*
  * Perform retransmission of packets that have been marked as lost.
  */
-void rxrpc_resend(struct rxrpc_call *call, rxrpc_serial_t ack_serial, bool ping_response)
+static void rxrpc_resend(struct rxrpc_call *call)
 {
        struct rxrpc_send_data_req req = {
                .now    = ktime_get_real(),
                .trace  = rxrpc_txdata_retransmit,
        };
-       struct rxrpc_txqueue *tq = call->tx_queue;
-       ktime_t lowest_xmit_ts = KTIME_MAX;
-       ktime_t rto = rxrpc_get_rto_backoff(call, false);
-       bool unacked = false;
+       struct rxrpc_txqueue *tq;
 
        _enter("{%d,%d}", call->tx_bottom, call->tx_top);
 
-       if (call->tx_bottom == call->tx_top) {
-               call->resend_at = KTIME_MAX;
-               trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend);
-               return;
-       }
+       trace_rxrpc_resend(call, call->acks_highest_serial);
 
-       trace_rxrpc_resend(call, ack_serial);
-
-       /* Scan the transmission queue, looking for explicitly NAK'd packets. */
-       do {
-               unsigned long naks = ~tq->segment_acked;
-               rxrpc_seq_t tq_top = tq->qbase + RXRPC_NR_TXQUEUE - 1;
+       /* Scan the transmission queue, looking for lost packets. */
+       for (tq = call->tx_queue; tq; tq = tq->next) {
+               unsigned long lost = tq->segment_lost;
 
                if (after(tq->qbase, call->tx_transmitted))
                        break;
 
-               if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE)
-                       naks &= (1UL << tq->nr_reported_acks) - 1;
-
                _debug("retr %16lx %u c=%08x [%x]",
                       tq->segment_acked, tq->nr_reported_acks, call->debug_id, tq->qbase);
-               _debug("nack %16lx", naks);
+               _debug("lost %16lx", lost);
 
-               while (naks) {
-                       unsigned int ix = __ffs(naks);
+               trace_rxrpc_resend_lost(call, tq, lost);
+               while (lost) {
+                       unsigned int ix = __ffs(lost);
                        struct rxrpc_txbuf *txb = tq->bufs[ix];
 
-                       __clear_bit(ix, &naks);
-                       if (after(txb->serial, call->acks_highest_serial))
-                               continue; /* Ack point not yet reached */
-
-                       rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked);
+                       __clear_bit(ix, &lost);
+                       rxrpc_see_txbuf(txb, rxrpc_txbuf_see_lost);
 
                        req.tq  = tq;
                        req.seq = tq->qbase + ix;
                        req.n   = 1;
-                       rxrpc_retransmit_data(call, &req, rto, false);
-               }
-
-               /* Anything after the soft-ACK table up to and including
-                * ack.previousPacket will get ACK'd or NACK'd in due course,
-                * so don't worry about those here.  We do, however, need to
-                * consider retransmitting anything beyond that point.
-                */
-               if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE &&
-                   after(tq_top, call->acks_prev_seq)) {
-                       rxrpc_seq_t start = latest(call->acks_prev_seq,
-                                                  tq->qbase + tq->nr_reported_acks);
-                       rxrpc_seq_t stop = earliest(tq_top, call->tx_transmitted);
-
-                       _debug("unrep %x-%x", start, stop);
-                       for (rxrpc_seq_t seq = start; before_eq(seq, stop); seq++) {
-                               rxrpc_serial_t serial = tq->segment_serial[seq & RXRPC_TXQ_MASK];
-
-                               if (ping_response &&
-                                   before(serial, call->acks_highest_serial))
-                                       break; /* Wasn't accounted for by a more recent ping. */
-                               req.tq  = tq;
-                               req.seq = seq;
-                               req.n   = 1;
-                               if (rxrpc_retransmit_data(call, &req, rto, true))
-                                       unacked = true;
-                       }
+                       rxrpc_retransmit_data(call, &req);
                }
-
-               /* Work out the next retransmission timeout. */
-               if (ktime_before(tq->xmit_ts_base, lowest_xmit_ts)) {
-                       unsigned int lowest_us = UINT_MAX;
-
-                       for (int i = 0; i < RXRPC_NR_TXQUEUE; i++)
-                               if (!test_bit(i, &tq->segment_acked) &&
-                                   tq->segment_xmit_ts[i] < lowest_us)
-                                       lowest_us = tq->segment_xmit_ts[i];
-                       _debug("lowest[%x] %llx %u", tq->qbase, tq->xmit_ts_base, lowest_us);
-
-                       if (lowest_us != UINT_MAX) {
-                               ktime_t lowest_ns = ktime_add_us(tq->xmit_ts_base, lowest_us);
-
-                               if (ktime_before(lowest_ns, lowest_xmit_ts))
-                                       lowest_xmit_ts = lowest_ns;
-                       }
-               }
-       } while ((tq = tq->next));
-
-       if (lowest_xmit_ts < KTIME_MAX) {
-               ktime_t delay = rxrpc_get_rto_backoff(call, req.did_send);
-               ktime_t resend_at = ktime_add(lowest_xmit_ts, delay);
-
-               _debug("delay %llu %lld", delay, ktime_sub(resend_at, req.now));
-               call->resend_at = resend_at;
-               trace_rxrpc_timer_set(call, ktime_sub(resend_at, req.now),
-                                     rxrpc_timer_trace_resend_reset);
-       } else {
-               call->resend_at = KTIME_MAX;
-               trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend);
        }
 
-       if (unacked)
-               rxrpc_congestion_timeout(call);
+       rxrpc_get_rto_backoff(call, req.did_send);
+       _leave("");
+}
 
-       /* If there was nothing that needed retransmission then it's likely
-        * that an ACK got lost somewhere.  Send a ping to find out instead of
-        * retransmitting data.
-        */
-       if (!req.did_send) {
-               ktime_t next_ping = ktime_add_us(call->acks_latest_ts,
-                                                call->srtt_us >> 3);
+/*
+ * Resend the highest-seq DATA packet so far transmitted for RACK-TLP [RFC8985 7.3].
+ */
+void rxrpc_resend_tlp(struct rxrpc_call *call)
+{
+       struct rxrpc_send_data_req req = {
+               .now            = ktime_get_real(),
+               .seq            = call->tx_transmitted,
+               .n              = 1,
+               .tlp_probe      = true,
+               .trace          = rxrpc_txdata_tlp_retransmit,
+       };
 
-               if (ktime_sub(next_ping, req.now) <= 0)
-                       rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
-                                      rxrpc_propose_ack_ping_for_0_retrans);
+       /* There's a chance it'll be on the tail segment of the queue. */
+       req.tq = READ_ONCE(call->tx_qtail);
+       if (req.tq &&
+           before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) {
+               rxrpc_retransmit_data(call, &req);
+               return;
        }
 
-       _leave("");
+       for (req.tq = call->tx_queue; req.tq; req.tq = req.tq->next) {
+               if (after_eq(call->tx_transmitted, req.tq->qbase) &&
+                   before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) {
+                       rxrpc_retransmit_data(call, &req);
+                       return;
+               }
+       }
 }
 
 /*
@@ -259,18 +188,10 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call)
        }
 }
 
-static unsigned int rxrpc_tx_window_space(struct rxrpc_call *call)
-{
-       int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra);
-       int in_flight = call->tx_top - call->tx_bottom;
-
-       return max(winsize - in_flight, 0);
-}
-
 /*
- * Transmit some as-yet untransmitted data.
+ * Transmit some as-yet untransmitted data, to a maximum of the supplied limit.
  */
-static void rxrpc_transmit_fresh_data(struct rxrpc_call *call,
+static void rxrpc_transmit_fresh_data(struct rxrpc_call *call, unsigned int limit,
                                      enum rxrpc_txdata_trace trace)
 {
        int space = rxrpc_tx_window_space(call);
@@ -335,8 +256,8 @@ static void rxrpc_transmit_fresh_data(struct rxrpc_call *call,
        }
 }
 
-static void rxrpc_transmit_some_data(struct rxrpc_call *call,
-                                    enum rxrpc_txdata_trace trace)
+void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit,
+                             enum rxrpc_txdata_trace trace)
 {
        switch (__rxrpc_call_state(call)) {
        case RXRPC_CALL_SERVER_ACK_REQUEST:
@@ -353,7 +274,7 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call,
                        rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow);
                        return;
                }
-               rxrpc_transmit_fresh_data(call, trace);
+               rxrpc_transmit_fresh_data(call, limit, trace);
                break;
        default:
                return;
@@ -380,7 +301,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call)
 {
        struct sk_buff *skb;
        ktime_t now, t;
-       bool resend = false, did_receive = false, saw_ack = false;
+       bool did_receive = false, saw_ack = false;
        s32 abort_code;
 
        rxrpc_see_call(call, rxrpc_call_see_input);
@@ -398,21 +319,33 @@ bool rxrpc_input_call_event(struct rxrpc_call *call)
                goto out;
        }
 
-       while ((skb = __skb_dequeue(&call->rx_queue))) {
-               struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+       do {
+               skb = __skb_dequeue(&call->rx_queue);
+               if (skb) {
+                       struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+                       if (__rxrpc_call_is_complete(call) ||
+                           skb->mark == RXRPC_SKB_MARK_ERROR) {
+                               rxrpc_free_skb(skb, rxrpc_skb_put_call_rx);
+                               goto out;
+                       }
+
+                       saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK;
 
-               if (__rxrpc_call_is_complete(call) ||
-                   skb->mark == RXRPC_SKB_MARK_ERROR) {
+                       rxrpc_input_call_packet(call, skb);
                        rxrpc_free_skb(skb, rxrpc_skb_put_call_rx);
-                       goto out;
+                       did_receive = true;
                }
 
-               saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK;
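+               /* Check the RACK-TLP timer between packets so that a long
+                * receive queue doesn't delay loss handling.
+                */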
+               t = ktime_sub(call->rack_timo_at, ktime_get_real());
+               if (t <= 0) {
+                       trace_rxrpc_timer_exp(call, t,
+                                             rxrpc_timer_trace_rack_off + call->rack_timer_mode);
+                       call->rack_timo_at = KTIME_MAX;
+                       rxrpc_rack_timer_expired(call, t);
+               }
 
-               rxrpc_input_call_packet(call, skb);
-               rxrpc_free_skb(skb, rxrpc_skb_put_call_rx);
-               did_receive = true;
-       }
+       } while (!skb_queue_empty(&call->rx_queue));
 
        /* If we see our async-event poke, check for timeout trippage. */
        now = ktime_get_real();
@@ -445,13 +378,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call)
                               rxrpc_propose_ack_delayed_ack);
        }
 
-       t = ktime_sub(call->ack_lost_at, now);
-       if (t <= 0) {
-               trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_lost_ack);
-               call->ack_lost_at = KTIME_MAX;
-               set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events);
-       }
-
        t = ktime_sub(call->ping_at, now);
        if (t <= 0) {
                trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_ping);
@@ -460,15 +386,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call)
                               rxrpc_propose_ack_ping_for_keepalive);
        }
 
-       t = ktime_sub(call->resend_at, now);
-       if (t <= 0) {
-               trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_resend);
-               call->resend_at = KTIME_MAX;
-               resend = true;
-       }
-
-       rxrpc_transmit_some_data(call, rxrpc_txdata_new_data);
-
        now = ktime_get_real();
        t = ktime_sub(call->keepalive_at, now);
        if (t <= 0) {
@@ -478,21 +395,30 @@ bool rxrpc_input_call_event(struct rxrpc_call *call)
                               rxrpc_propose_ack_ping_for_keepalive);
        }
 
+       if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events))
+               rxrpc_send_initial_ping(call);
+
+       rxrpc_transmit_some_data(call, UINT_MAX, rxrpc_txdata_new_data);
+
        if (saw_ack)
                rxrpc_congestion_degrade(call);
 
-       if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events))
-               rxrpc_send_initial_ping(call);
+       if (did_receive &&
+           (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_SEND_REQUEST ||
+            __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SEND_REPLY)) {
+               t = ktime_sub(call->rack_timo_at, ktime_get_real());
+               trace_rxrpc_rack(call, t);
+       }
 
        /* Process events */
        if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events))
                rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
                               rxrpc_propose_ack_ping_for_lost_ack);
 
-       if (resend &&
+       if (call->tx_nr_lost > 0 &&
            __rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY &&
            !test_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags))
-               rxrpc_resend(call, 0, false);
+               rxrpc_resend(call);
 
        if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags))
                rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0,
@@ -520,8 +446,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call)
                set(call->expect_req_by);
                set(call->expect_rx_by);
                set(call->delay_ack_at);
-               set(call->ack_lost_at);
-               set(call->resend_at);
+               set(call->rack_timo_at);
                set(call->keepalive_at);
                set(call->ping_at);
 
index fb4ee0d2e9e1600f0bb672303a3d79a09f991aa8..5a543c3f6fb0856c745e9136b1476a0b872990bc 100644
@@ -160,8 +160,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
        call->ackr_window       = 1;
        call->ackr_wtop         = 1;
        call->delay_ack_at      = KTIME_MAX;
-       call->ack_lost_at       = KTIME_MAX;
-       call->resend_at         = KTIME_MAX;
+       call->rack_timo_at      = KTIME_MAX;
        call->ping_at           = KTIME_MAX;
        call->keepalive_at      = KTIME_MAX;
        call->expect_rx_by      = KTIME_MAX;
index 9f308bd512e93fde6db9aafc3eca4997bd1b3233..4974b5accafa3372238127b5fccfff05bc8215cf 100644
@@ -27,13 +27,13 @@ static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq,
 }
 
 /*
- * Do TCP-style congestion management [RFC 5681].
+ * Do TCP-style congestion management [RFC5681].
  */
 static void rxrpc_congestion_management(struct rxrpc_call *call,
                                        struct rxrpc_ack_summary *summary)
 {
        summary->change = rxrpc_cong_no_change;
-       summary->in_flight = (call->tx_top - call->tx_bottom) - call->acks_nr_sacks;
+       summary->in_flight = rxrpc_tx_in_flight(call);
 
        if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) {
                summary->retrans_timeo = true;
@@ -106,9 +106,12 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
                call->cong_extra = 0;
                call->cong_dup_acks = 0;
                summary->need_retransmit = true;
+               summary->in_fast_or_rto_recovery = true;
                goto out;
 
        case RXRPC_CA_FAST_RETRANSMIT:
+               rxrpc_tlp_init(call);
+               summary->in_fast_or_rto_recovery = true;
                if (!summary->new_low_snack) {
                        if (summary->nr_new_sacks == 0)
                                call->cong_cwnd += 1;
@@ -121,8 +124,10 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
                } else {
                        summary->change = rxrpc_cong_progress;
                        call->cong_cwnd = call->cong_ssthresh;
-                       if (call->acks_nr_snacks == 0)
+                       if (call->acks_nr_snacks == 0) {
+                               summary->exiting_fast_or_rto_recovery = true;
                                goto resume_normality;
+                       }
                }
                goto out;
 
@@ -171,7 +176,7 @@ send_extra_data:
  */
 void rxrpc_congestion_degrade(struct rxrpc_call *call)
 {
-       ktime_t rtt, now;
+       ktime_t rtt, now, time_since;
 
        if (call->cong_ca_state != RXRPC_CA_SLOW_START &&
            call->cong_ca_state != RXRPC_CA_CONGEST_AVOIDANCE)
@@ -181,10 +186,11 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call)
 
        rtt = ns_to_ktime(call->srtt_us * (NSEC_PER_USEC / 8));
        now = ktime_get_real();
-       if (!ktime_before(ktime_add(call->tx_last_sent, rtt), now))
+       time_since = ktime_sub(now, call->tx_last_sent);
+       if (ktime_before(time_since, rtt))
                return;
 
-       trace_rxrpc_reset_cwnd(call, now);
+       trace_rxrpc_reset_cwnd(call, time_since, rtt);
        rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset);
        call->tx_last_sent = now;
        call->cong_ca_state = RXRPC_CA_SLOW_START;
@@ -200,11 +206,11 @@ static void rxrpc_add_data_rtt_sample(struct rxrpc_call *call,
                                      struct rxrpc_txqueue *tq,
                                      int ix)
 {
+       ktime_t xmit_ts = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]);
+
        rxrpc_call_add_rtt(call, rxrpc_rtt_rx_data_ack, -1,
                           summary->acked_serial, summary->ack_serial,
-                          ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]),
-                          call->acks_latest_ts);
-       summary->rtt_sample_avail = false;
+                          xmit_ts, call->acks_latest_ts);
        __clear_bit(ix, &tq->rtt_samples); /* Prevent repeat RTT sample */
 }
 
@@ -216,7 +222,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
 {
        struct rxrpc_txqueue *tq = call->tx_queue;
        rxrpc_seq_t seq = call->tx_bottom + 1;
-       bool rot_last = false;
+       bool rot_last = false, trace = false;
 
        _enter("%x,%x", call->tx_bottom, to);
 
@@ -250,14 +256,16 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
                        rot_last = true;
                }
 
-               if (summary->rtt_sample_avail &&
-                   summary->acked_serial == tq->segment_serial[ix] &&
+               if (summary->acked_serial == tq->segment_serial[ix] &&
                    test_bit(ix, &tq->rtt_samples))
                        rxrpc_add_data_rtt_sample(call, summary, tq, ix);
 
                if (ix == tq->nr_reported_acks) {
                        /* Packet directly hard ACK'd. */
                        tq->nr_reported_acks++;
+                       rxrpc_input_rack_one(call, summary, tq, ix);
+                       if (seq == call->tlp_seq)
+                               summary->tlp_probe_acked = true;
                        summary->nr_new_hacks++;
                        __set_bit(ix, &tq->segment_acked);
                        trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_hack);
@@ -268,11 +276,21 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
                } else {
                        /* Soft NAK -> hard ACK. */
                        call->acks_nr_snacks--;
+                       rxrpc_input_rack_one(call, summary, tq, ix);
+                       if (seq == call->tlp_seq)
+                               summary->tlp_probe_acked = true;
                        summary->nr_new_hacks++;
                        __set_bit(ix, &tq->segment_acked);
                        trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_snak);
                }
 
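+               /* The segment is now hard-ACK'd: drop it from the RACK-TLP
+                * sent/lost/resent accounting.
+                */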
+               call->tx_nr_sent--;
+               if (__test_and_clear_bit(ix, &tq->segment_lost))
+                       call->tx_nr_lost--;
+               if (__test_and_clear_bit(ix, &tq->segment_retransmitted))
+                       call->tx_nr_resent--;
+               __clear_bit(ix, &tq->ever_retransmitted);
+
                rxrpc_put_txbuf(tq->bufs[ix], rxrpc_txbuf_put_rotated);
                tq->bufs[ix] = NULL;
 
@@ -282,7 +300,10 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
                                           rxrpc_txqueue_rotate));
 
                seq++;
+               trace = true;
                if (!(seq & RXRPC_TXQ_MASK)) {
+                       trace_rxrpc_rack_update(call, summary);
+                       trace = false;
                        prefetch(tq->next);
                        if (tq != call->tx_qtail) {
                                call->tx_qbase += RXRPC_NR_TXQUEUE;
@@ -299,6 +320,9 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
 
        } while (before_eq(seq, to));
 
+       if (trace)
+               trace_rxrpc_rack_update(call, summary);
+
        if (rot_last) {
                set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags);
                if (tq) {
@@ -325,8 +349,10 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
 {
        ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags));
 
-       call->resend_at = KTIME_MAX;
-       trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend);
+       call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF;
+       call->rack_timo_at = KTIME_MAX;
+       trace_rxrpc_rack_timer(call, 0, false);
+       trace_rxrpc_timer_can(call, rxrpc_timer_trace_rack_off + call->rack_timer_mode);
 
        switch (__rxrpc_call_state(call)) {
        case RXRPC_CALL_CLIENT_SEND_REQUEST:
@@ -842,10 +868,13 @@ static void rxrpc_input_soft_ack_tq(struct rxrpc_call *call,
                                    rxrpc_seq_t seq,
                                    rxrpc_seq_t *lowest_nak)
 {
-       unsigned long old_reported, flipped, new_acks, a_to_n, n_to_a;
+       unsigned long old_reported = 0, flipped, new_acks = 0;
+       unsigned long a_to_n, n_to_a = 0;
        int new, a, n;
 
-       old_reported = ~0UL >> (RXRPC_NR_TXQUEUE - tq->nr_reported_acks);
+       if (tq->nr_reported_acks > 0)
+               old_reported = ~0UL >> (RXRPC_NR_TXQUEUE - tq->nr_reported_acks);
+
        _enter("{%x,%lx,%d},%lx,%d,%x",
               tq->qbase, tq->segment_acked, tq->nr_reported_acks,
               extracted_acks, nr_reported, seq);
@@ -898,6 +927,18 @@ static void rxrpc_input_soft_ack_tq(struct rxrpc_call *call,
                if (before(lowest, *lowest_nak))
                        *lowest_nak = lowest;
        }
+
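+       /* Pass any soft-ACK RTT sample and the newly ACK'd segments to the
+        * RACK-TLP code, noting whether the TLP probe got ACK'd.
+        */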
+       if (summary->acked_serial)
+               rxrpc_input_soft_rtt(call, summary, tq);
+
+       new_acks |= n_to_a;
+       if (new_acks)
+               rxrpc_input_rack(call, summary, tq, new_acks);
+
+       if (call->tlp_serial &&
+           rxrpc_seq_in_txq(tq, call->tlp_seq) &&
+           test_bit(call->tlp_seq - tq->qbase, &new_acks))
+               summary->tlp_probe_acked = true;
 }
 
 /*
@@ -940,8 +981,6 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call,
 
                _debug("bound %16lx %u", extracted, nr);
 
-               if (summary->rtt_sample_avail)
-                       rxrpc_input_soft_rtt(call, summary, tq);
                rxrpc_input_soft_ack_tq(call, summary, tq, extracted, RXRPC_NR_TXQUEUE,
                                        seq - RXRPC_NR_TXQUEUE, &lowest_nak);
                extracted = ~0UL;
@@ -1063,7 +1102,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
        /* Discard any out-of-order or duplicate ACKs (outside lock). */
        if (!rxrpc_is_ack_valid(call, hard_ack, prev_pkt)) {
                trace_rxrpc_rx_discard_ack(call, summary.ack_serial, hard_ack, prev_pkt);
-               goto send_response;
+               goto send_response; /* Still respond if requested. */
        }
 
        trailer.maxMTU = 0;
@@ -1079,14 +1118,19 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
        call->acks_hard_ack = hard_ack;
        call->acks_prev_seq = prev_pkt;
 
-       switch (summary.ack_reason) {
-       case RXRPC_ACK_PING:
-               break;
-       default:
-               if (summary.acked_serial &&
-                   after(summary.acked_serial, call->acks_highest_serial))
-                       call->acks_highest_serial = summary.acked_serial;
-               break;
+       if (summary.acked_serial) {
+               switch (summary.ack_reason) {
+               case RXRPC_ACK_PING_RESPONSE:
+                       rxrpc_complete_rtt_probe(call, call->acks_latest_ts,
+                                                summary.acked_serial, summary.ack_serial,
+                                                rxrpc_rtt_rx_ping_response);
+                       break;
+               default:
+                       if (after(summary.acked_serial, call->acks_highest_serial))
+                               call->acks_highest_serial = summary.acked_serial;
+                       summary.rtt_sample_avail = true;
+                       break;
+               }
        }
 
        /* Parse rwind and mtu sizes if provided. */
@@ -1096,15 +1140,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
        if (hard_ack + 1 == 0)
                return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero);
 
-       if (summary.acked_serial) {
-               if (summary.ack_reason == RXRPC_ACK_PING_RESPONSE)
-                       rxrpc_complete_rtt_probe(call, call->acks_latest_ts,
-                                                summary.acked_serial, summary.ack_serial,
-                                                rxrpc_rtt_rx_ping_response);
-               else
-                       summary.rtt_sample_avail = true;
-       }
-
        /* Ignore ACKs unless we are or have just been transmitting. */
        switch (__rxrpc_call_state(call)) {
        case RXRPC_CALL_CLIENT_SEND_REQUEST:
@@ -1141,10 +1176,14 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
                rxrpc_propose_ping(call, summary.ack_serial,
                                   rxrpc_propose_ack_ping_for_lost_reply);
 
+       /* Drive the congestion management algorithm first and then RACK-TLP as
+        * the latter depends on the state/change in state in the former.
+        */
        rxrpc_congestion_management(call, &summary);
-       if (summary.need_retransmit)
-               rxrpc_resend(call, summary.ack_serial,
-                            summary.ack_reason == RXRPC_ACK_PING_RESPONSE);
+       rxrpc_rack_detect_loss_and_arm_timer(call, &summary);
+       rxrpc_tlp_process_ack(call, &summary);
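+       /* An ACK serial at or beyond the probe's own serial means the probe
+        * is no longer outstanding.
+        */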
+       if (call->tlp_serial && after_eq(summary.acked_serial, call->tlp_serial))
+               call->tlp_serial = 0;
 
 send_response:
        if (summary.ack_reason == RXRPC_ACK_PING)
diff --git a/net/rxrpc/input_rack.c b/net/rxrpc/input_rack.c
new file mode 100644 (file)
index 0000000..13c3712
--- /dev/null
@@ -0,0 +1,418 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* RACK-TLP [RFC8985] Implementation
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "ar-internal.h"
+
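+/*
+ * Compare transmission precedence: a segment is regarded as sent after
+ * another if it has a later transmission timestamp or, when the timestamps
+ * tie, a higher sequence number (the "sent after" test of RFC8985).
+ */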
+static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1,
+                                 ktime_t t2, rxrpc_seq_t seq2)
+{
+       if (ktime_after(t1, t2))
+               return true;
+       return t1 == t2 && after(seq1, seq2);
+}
+
+/*
+ * Mark a packet lost.
+ */
+static void rxrpc_rack_mark_lost(struct rxrpc_call *call,
+                                struct rxrpc_txqueue *tq, unsigned int ix)
+{
+       if (__test_and_set_bit(ix, &tq->segment_lost)) {
+               if (__test_and_clear_bit(ix, &tq->segment_retransmitted))
+                       call->tx_nr_resent--;
+       } else {
+               call->tx_nr_lost++;
+       }
+       tq->segment_xmit_ts[ix] = UINT_MAX;
+}
+
+/*
+ * Get the transmission time of a packet in the Tx queue.
+ */
+static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix)
+{
+       if (tq->segment_xmit_ts[ix] == UINT_MAX)
+               return KTIME_MAX;
+       return ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]);
+}
+
+/*
+ * Get a bitmask of nack bits for a queue segment and mask off any that aren't
+ * yet reported.
+ */
+static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq)
+{
+       unsigned long nacks = ~tq->segment_acked;
+
+       if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE)
+               nacks &= (1UL << tq->nr_reported_acks) - 1;
+       return nacks;
+}
+
+/*
+ * Update the RACK state for the most recently sent packet that has been
+ * delivered [RFC8985 6.2 Step 2].
+ */
+static void rxrpc_rack_update(struct rxrpc_call *call,
+                             struct rxrpc_ack_summary *summary,
+                             struct rxrpc_txqueue *tq,
+                             unsigned int ix)
+{
+       rxrpc_seq_t seq = tq->qbase + ix;
+       ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
+       ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts);
+
+       if (__test_and_clear_bit(ix, &tq->segment_lost))
+               call->tx_nr_lost--;
+
+       if (test_bit(ix, &tq->segment_retransmitted)) {
+               /* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply. */
+               if (before(call->acks_highest_serial, tq->segment_serial[ix]))
+                       return;
+               if (rtt < minmax_get(&call->min_rtt))
+                       return;
+       }
+
+       /* The RACK algorithm requires the segment ACKs to be traversed in
+        * order of segment transmission - but the only thing this seems to
+        * matter for is that RACK.rtt is set to the rtt of the most recently
+        * transmitted segment.  We should be able to achieve the same by only
+        * setting RACK.rtt if the xmit time is later.
+        */
+       if (ktime_after(xmit_ts, call->rack_rtt_ts)) {
+               call->rack_rtt    = rtt;
+               call->rack_rtt_ts = xmit_ts;
+       }
+
+       if (rxrpc_rack_sent_after(xmit_ts, seq, call->rack_xmit_ts, call->rack_end_seq)) {
+               call->rack_rtt = rtt;
+               call->rack_xmit_ts = xmit_ts;
+               call->rack_end_seq = seq;
+       }
+}
+
+/*
+ * Detect data segment reordering [RFC8985 6.2 Step 3].
+ */
+static void rxrpc_rack_detect_reordering(struct rxrpc_call *call,
+                                        struct rxrpc_ack_summary *summary,
+                                        struct rxrpc_txqueue *tq,
+                                        unsigned int ix)
+{
+       rxrpc_seq_t seq = tq->qbase + ix;
+
+       /* Track the highest sequence number so far ACK'd.  This is not
+        * necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer
+        * could put a NACK in the last SACK slot.
+        */
+       if (after(seq, call->rack_fack))
+               call->rack_fack = seq;
+       else if (before(seq, call->rack_fack) &&
+                test_bit(ix, &tq->segment_retransmitted))
+               call->rack_reordering_seen = true;
+}
+
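+/*
+ * Process one newly-ACK'd segment: update the RACK state for it [RFC8985 6.2
+ * Step 2] and check it for reordering [RFC8985 6.2 Step 3].
+ */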
+void rxrpc_input_rack_one(struct rxrpc_call *call,
+                         struct rxrpc_ack_summary *summary,
+                         struct rxrpc_txqueue *tq,
+                         unsigned int ix)
+{
+       rxrpc_rack_update(call, summary, tq, ix);
+       rxrpc_rack_detect_reordering(call, summary, tq, ix);
+}
+
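+/*
+ * Run RACK over each segment that an ACK newly marked as delivered in this
+ * transmission queue.
+ */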
+void rxrpc_input_rack(struct rxrpc_call *call,
+                     struct rxrpc_ack_summary *summary,
+                     struct rxrpc_txqueue *tq,
+                     unsigned long new_acks)
+{
+       while (new_acks) {
+               unsigned int ix = __ffs(new_acks);
+
+               __clear_bit(ix, &new_acks);
+               rxrpc_input_rack_one(call, summary, tq, ix);
+       }
+
+       trace_rxrpc_rack_update(call, summary);
+}
+
+/*
+ * Update the reordering window [RFC8985 6.2 Step 4].  Returns the updated
+ * duration of the reordering window.
+ *
+ * Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs can
+ * be given a 'DUPLICATE' reason with the serial number referring to the
+ * duplicated DATA packet.  Rx does not inform as to whether this was a
+ * reception of the same packet twice or of a retransmission of a packet we
+ * already received (though this could be determined by the transmitter based
+ * on the serial number).
+ */
+static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call,
+                                        struct rxrpc_ack_summary *summary)
+{
+       rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
+       rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */
+       bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE;
+       int dup_thresh = 3;
+
+       /* DSACK-based reordering window adaptation */
+       if (!call->rack_dsack_round_none &&
+           after_eq(snd_una, call->rack_dsack_round))
+               call->rack_dsack_round_none = true;
+
+       /* Grow the reordering window per round that sees DSACK.  Reset the
+        * window after 16 DSACK-free recoveries.
+        */
+       if (call->rack_dsack_round_none && have_dsack_option) {
+               call->rack_dsack_round_none = false;
+               call->rack_dsack_round = snd_nxt;
+               call->rack_reo_wnd_mult++;
+               call->rack_reo_wnd_persist = 16;
+       } else if (summary->exiting_fast_or_rto_recovery) {
+               call->rack_reo_wnd_persist--;
+               if (call->rack_reo_wnd_persist <= 0)
+                       call->rack_reo_wnd_mult = 1;
+       }
+
+       if (!call->rack_reordering_seen) {
+               if (summary->in_fast_or_rto_recovery)
+                       return 0;
+               if (call->acks_nr_sacks >= dup_thresh)
+                       return 0;
+       }
+
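+       /* reo_wnd = min(mult * RACK.min_RTT / 4, SRTT); for example, a 20ms
+        * minimum RTT and a multiplier of 1 give min(5ms, SRTT).
+        */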
+       return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4,
+                               call->srtt_us >> 3));
+}
+
+/*
+ * Detect losses [RFC8985 6.2 Step 5].
+ */
+static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call,
+                                     struct rxrpc_ack_summary *summary)
+{
+       struct rxrpc_txqueue *tq;
+       ktime_t timeout = 0, lost_after, now = ktime_get_real();
+
+       call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary);
+       lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
+       trace_rxrpc_rack_scan_loss(call);
+
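+       /* A nack'd segment is deemed lost once RACK.rtt + RACK.reo_wnd has
+        * elapsed since it was transmitted; anything younger sets or extends
+        * the reordering timer armed below.
+        */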
+       for (tq = call->tx_queue; tq; tq = tq->next) {
+               unsigned long nacks = rxrpc_tq_nacks(tq);
+
+               if (after(tq->qbase, call->tx_transmitted))
+                       break;
+               trace_rxrpc_rack_scan_loss_tq(call, tq, nacks);
+
+               /* Skip ones marked lost but not yet retransmitted */
+               nacks &= ~tq->segment_lost | tq->segment_retransmitted;
+
+               while (nacks) {
+                       unsigned int ix = __ffs(nacks);
+                       rxrpc_seq_t seq = tq->qbase + ix;
+                       ktime_t remaining;
+                       ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
+
+                       __clear_bit(ix, &nacks);
+
+                       if (rxrpc_rack_sent_after(call->rack_xmit_ts, call->rack_end_seq,
+                                                 xmit_ts, seq)) {
+                               remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now);
+                               if (remaining <= 0) {
+                                       rxrpc_rack_mark_lost(call, tq, ix);
+                                       trace_rxrpc_rack_detect_loss(call, summary, seq);
+                               } else {
+                                       timeout = max(remaining, timeout);
+                               }
+                       }
+               }
+       }
+
+       return timeout;
+}
+
+/*
+ * Detect losses and set a timer to retry the detection [RFC8985 6.2 Step 5].
+ */
+void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call,
+                                         struct rxrpc_ack_summary *summary)
+{
+       ktime_t timeout = rxrpc_rack_detect_loss(call, summary);
+
+       if (timeout) {
+               call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER;
+               call->rack_timo_at = ktime_add(ktime_get_real(), timeout);
+               trace_rxrpc_rack_timer(call, timeout, false);
+               trace_rxrpc_timer_set(call, timeout, rxrpc_timer_trace_rack_reo);
+       }
+}
+
+/*
+ * Handle RACK-TLP RTO expiration [RFC8985 6.3].
+ */
+static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call)
+{
+       struct rxrpc_txqueue *tq;
+       rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
+       ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
+       ktime_t deadline = ktime_sub(ktime_get_real(), lost_after);
+
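+       /* Mark as lost every unacked segment that was transmitted more than
+        * RACK.rtt + RACK.reo_wnd ago, plus the lowest unacked segment
+        * regardless of age.
+        */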
+       for (tq = call->tx_queue; tq; tq = tq->next) {
+               unsigned long unacked = ~tq->segment_acked;
+
+               trace_rxrpc_rack_mark_loss_tq(call, tq);
+               while (unacked) {
+                       unsigned int ix = __ffs(unacked);
+                       rxrpc_seq_t seq = tq->qbase + ix;
+                       ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
+
+                       if (after(seq, call->tx_transmitted))
+                               return;
+                       __clear_bit(ix, &unacked);
+
+                       if (seq == snd_una ||
+                           ktime_before(xmit_ts, deadline))
+                               rxrpc_rack_mark_lost(call, tq, ix);
+               }
+       }
+}
+
+/*
+ * Calculate the TLP loss probe timeout (PTO) [RFC8985 7.2].
+ */
+ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now)
+{
+       unsigned int flight_size = rxrpc_tx_in_flight(call);
+       ktime_t rto_at = ktime_add(call->tx_last_sent,
+                                  rxrpc_get_rto_backoff(call, false));
+       ktime_t pto;
+
+       if (call->rtt_count > 0) {
+               /* Use 2*SRTT as the timeout: srtt_us holds 8 * SRTT, so
+                * dividing by 4 yields 2 * SRTT in nanoseconds.
+                */
+               pto = ns_to_ktime(call->srtt_us * NSEC_PER_USEC / 4);
+               if (flight_size)
+                       pto = ktime_add(pto, call->tlp_max_ack_delay);
+       } else {
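+               /* No RTT sample is available yet, so fall back to a 1s
+                * initial timeout.
+                */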
+               pto = NSEC_PER_SEC;
+       }
+
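+       /* Don't let the probe fire later than the RTO would; if it would,
+        * pull it forward to the RTO expiry point.
+        */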
+       if (ktime_after(ktime_add(now, pto), rto_at))
+               pto = ktime_sub(rto_at, now);
+       return pto;
+}
+
+/*
+ * Send a TLP loss probe on PTO expiration [RFC8985 7.3].
+ */
+void rxrpc_tlp_send_probe(struct rxrpc_call *call)
+{
+       unsigned int in_flight = rxrpc_tx_in_flight(call);
+
+       if (after_eq(call->acks_hard_ack, call->tx_transmitted))
+               return; /* Everything we transmitted has been acked. */
+
+       /* There must be no other loss probe still in flight and we need to
+        * have taken a new RTT sample since the last probe or the start of
+        * the connection.
+        */
+       if (!call->tlp_serial &&
+           call->tlp_rtt_taken != call->rtt_taken) {
+               call->tlp_is_retrans = false;
+               if (after(call->send_top, call->tx_transmitted) &&
+                   rxrpc_tx_window_space(call) > 0) {
+                       /* Transmit the lowest-sequence unsent DATA */
+                       call->tx_last_serial = 0;
+                       rxrpc_transmit_some_data(call, 1, rxrpc_txdata_tlp_new_data);
+                       call->tlp_serial = call->tx_last_serial;
+                       call->tlp_seq = call->tx_transmitted;
+                       trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_transmit_new);
+                       in_flight = rxrpc_tx_in_flight(call);
+               } else {
+                       /* Retransmit the highest-sequence DATA sent */
+                       call->tx_last_serial = 0;
+                       rxrpc_resend_tlp(call);
+                       call->tlp_is_retrans = true;
+                       trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_retransmit);
+               }
+       } else {
+               trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_busy);
+       }
+
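+       /* Whether or not a probe went out, re-arm the timer as an RTO while
+        * data remains in flight [RFC8985 7.3].
+        */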
+       if (in_flight != 0) {
+               ktime_t rto = rxrpc_get_rto_backoff(call, false);
+
+               call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO;
+               call->rack_timo_at = ktime_add(ktime_get_real(), rto);
+               trace_rxrpc_rack_timer(call, rto, false);
+               trace_rxrpc_timer_set(call, rto, rxrpc_timer_trace_rack_rto);
+       }
+}
+
+/*
+ * Detect losses using the ACK of a TLP loss probe [RFC8985 7.4].
+ */
+void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary)
+{
+       if (!call->tlp_serial || after(call->tlp_seq, call->acks_hard_ack))
+               return;
+
+       if (!call->tlp_is_retrans) {
+               /* TLP of new data delivered */
+               trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_new_data);
+               call->tlp_serial = 0;
+       } else if (summary->ack_reason == RXRPC_ACK_DUPLICATE &&
+                  summary->acked_serial == call->tlp_serial) {
+               /* General Case: Detected packet losses using RACK [7.4.1] */
+               trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_dup_acked);
+               call->tlp_serial = 0;
+       } else if (after(call->acks_hard_ack, call->tlp_seq)) {
+               /* Repaired the single loss */
+               trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_hard_beyond);
+               call->tlp_serial = 0;
+               // TODO: Invoke congestion control to react to the loss
+               // event the probe has repaired
+       } else if (summary->tlp_probe_acked) {
+               trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_acked);
+               /* Special Case: Detected a single loss repaired by the loss
+                * probe [7.4.2]
+                */
+               call->tlp_serial = 0;
+       } else {
+               trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_incomplete);
+       }
+}
+
+/*
+ * Handle RACK timer expiration in whichever of its modes it was armed.
+ */
+void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by)
+{
+       struct rxrpc_ack_summary summary = {};
+       enum rxrpc_rack_timer_mode mode = call->rack_timer_mode;
+
+       trace_rxrpc_rack_timer(call, overran_by, true);
+       call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF;
+
+       switch (mode) {
+       case RXRPC_CALL_RACKTIMER_RACK_REORDER:
+               rxrpc_rack_detect_loss_and_arm_timer(call, &summary);
+               break;
+       case RXRPC_CALL_RACKTIMER_TLP_PTO:
+               rxrpc_tlp_send_probe(call);
+               break;
+       case RXRPC_CALL_RACKTIMER_RTO:
+               // Might need to poke the congestion algo in some way
+               rxrpc_rack_mark_losses_on_rto(call);
+               break;
+       //case RXRPC_CALL_RACKTIMER_ZEROWIN:
+       default:
+               pr_warn("Unexpected rack timer %u\n", mode);
+       }
+}
index fbacf2056f643f5f9fc5f45626fca74141fde29a..2925c7fc82cfb47e4cbe86f95df4c252518b9320 100644 (file)
@@ -470,6 +470,7 @@ int rxrpc_io_thread(void *data)
                        spin_lock_irq(&local->rx_queue.lock);
                        skb_queue_splice_tail_init(&local->rx_queue, &rx_queue);
                        spin_unlock_irq(&local->rx_queue.lock);
+                       trace_rxrpc_iothread_rx(local, skb_queue_len(&rx_queue));
                }
 
                /* Distribute packets and errors. */
index f934551a9b1c69c1887a435a510a46fd99c3cee6..6f7a125d6e9085e519f7c9cc6fc39c2b401887ba 100644 (file)
@@ -542,12 +542,14 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se
        unsigned int xmit_ts;
        rxrpc_seq_t seq = req->seq;
        size_t len = 0;
+       bool start_tlp = false;
 
        trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit);
 
        /* Each transmission of a Tx packet needs a new serial number */
        serial = rxrpc_get_next_serials(call->conn, req->n);
 
+       call->tx_last_serial = serial + req->n - 1;
        call->tx_last_sent = req->now;
        xmit_ts = rxrpc_prepare_txqueue(tq, req);
        prefetch(tq->next);
@@ -557,6 +559,18 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se
                struct rxrpc_txbuf *txb = tq->bufs[seq & RXRPC_TXQ_MASK];
 
                _debug("prep[%u] tq=%x q=%x", i, tq->qbase, seq);
+
+               /* Record (re-)transmission for RACK [RFC8985 6.1]. */
+               if (__test_and_clear_bit(ix, &tq->segment_lost))
+                       call->tx_nr_lost--;
+               if (req->retrans) {
+                       __set_bit(ix, &tq->ever_retransmitted);
+                       __set_bit(ix, &tq->segment_retransmitted);
+                       call->tx_nr_resent++;
+               } else {
+                       call->tx_nr_sent++;
+                       start_tlp = true;
+               }
                tq->segment_xmit_ts[ix] = xmit_ts;
                tq->segment_serial[ix] = serial;
                if (i + 1 == req->n)
@@ -576,11 +590,24 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se
        }
 
        /* Set timeouts */
-       if (call->rtt_count > 1) {
-               ktime_t delay = rxrpc_get_rto_backoff(call, false);
+       if (req->tlp_probe) {
+               /* Sending TLP loss probe [RFC8985 7.3]. */
+               call->tlp_serial = serial - 1;
+               call->tlp_seq = seq - 1;
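+               /* serial and seq were advanced past the packets just built,
+                * so the probe actually sent is the preceding value of each.
+                */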
+       } else if (start_tlp) {
+               /* Schedule TLP loss probe [RFC8985 7.2]. */
+               ktime_t pto;
+
+               if (!test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags))
+                       /* The first packet may take longer to elicit a response. */
+                       pto = NSEC_PER_SEC;
+               else
+                       pto = rxrpc_tlp_calc_pto(call, req->now);
 
-               call->ack_lost_at = ktime_add(req->now, delay);
-               trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack);
+               call->rack_timer_mode = RXRPC_CALL_RACKTIMER_TLP_PTO;
+               call->rack_timo_at = ktime_add(req->now, pto);
+               trace_rxrpc_rack_timer(call, pto, false);
+               trace_rxrpc_timer_set(call, pto, rxrpc_timer_trace_rack_tlp_pto);
        }
 
        if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) {
@@ -589,12 +616,6 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se
                call->expect_rx_by = ktime_add(req->now, delay);
                trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx);
        }
-       if (call->resend_at == KTIME_MAX) {
-               ktime_t delay = rxrpc_get_rto_backoff(call, false);
-
-               call->resend_at = ktime_add(req->now, delay);
-               trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_resend);
-       }
 
        rxrpc_set_keepalive(call, req->now);
        return len;