mptcp: put reference in mptcp timeout timer
net/mptcp/protocol.c (linux-2.6-block.git)
1// SPDX-License-Identifier: GPL-2.0
2/* Multipath TCP
3 *
4 * Copyright (c) 2017 - 2019, Intel Corporation.
5 */
6
7#define pr_fmt(fmt) "MPTCP: " fmt
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/netdevice.h>
12#include <linux/sched/signal.h>
13#include <linux/atomic.h>
14#include <net/sock.h>
15#include <net/inet_common.h>
16#include <net/inet_hashtables.h>
17#include <net/protocol.h>
18#include <net/tcp.h>
19#include <net/tcp_states.h>
20#if IS_ENABLED(CONFIG_MPTCP_IPV6)
21#include <net/transp_v6.h>
22#endif
23#include <net/mptcp.h>
24#include <net/xfrm.h>
25#include "protocol.h"
26#include "mib.h"
27
28#if IS_ENABLED(CONFIG_MPTCP_IPV6)
29struct mptcp6_sock {
30	struct mptcp_sock msk;
31	struct ipv6_pinfo np;
32};
33#endif
34
35struct mptcp_skb_cb {
36	u64 map_seq;
37	u64 end_seq;
38	u32 offset;
39};
40
41#define MPTCP_SKB_CB(__skb)	((struct mptcp_skb_cb *)&((__skb)->cb[0]))
42
43static struct percpu_counter mptcp_sockets_allocated;
44
45static void __mptcp_destroy_sock(struct sock *sk);
46static void __mptcp_check_send_data_fin(struct sock *sk);
47
48/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
49 * completed yet or has failed, return the subflow socket.
50 * Otherwise return NULL.
51 */
52static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
53{
d22f4988 54 if (!msk->subflow || READ_ONCE(msk->can_ack))
55 return NULL;
56
57 return msk->subflow;
58}
59
60/* Returns end sequence number of the receiver's advertised window */
61static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
62{
63 return atomic64_read(&msk->wnd_end);
64}
65
d2f77c53 66static bool mptcp_is_tcpsk(struct sock *sk)
67{
68 struct socket *sock = sk->sk_socket;
69
70 if (unlikely(sk->sk_prot == &tcp_prot)) {
71 /* we are being invoked after mptcp_accept() has
72 * accepted a non-mp-capable flow: sk is a tcp_sk,
73 * not an mptcp one.
74 *
75 * Hand the socket over to tcp so all further socket ops
76 * bypass mptcp.
77 */
78 sock->ops = &inet_stream_ops;
d2f77c53 79 return true;
80#if IS_ENABLED(CONFIG_MPTCP_IPV6)
81 } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
82 sock->ops = &inet6_stream_ops;
d2f77c53 83 return true;
84#endif
85 }
86
d2f77c53 87 return false;
88}
89
76660afb 90static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
cec37a6e 91{
92 sock_owned_by_me((const struct sock *)msk);
93
e1ff9e82 94 if (likely(!__mptcp_check_fallback(msk)))
95 return NULL;
96
76660afb 97 return msk->first;
98}
99
fa68018d 100static int __mptcp_socket_create(struct mptcp_sock *msk)
101{
102 struct mptcp_subflow_context *subflow;
103 struct sock *sk = (struct sock *)msk;
104 struct socket *ssock;
105 int err;
106
107 err = mptcp_subflow_create_socket(sk, &ssock);
108 if (err)
fa68018d 109 return err;
2303f994 110
8ab183de 111 msk->first = ssock->sk;
112 msk->subflow = ssock;
113 subflow = mptcp_subflow_ctx(ssock->sk);
cec37a6e 114 list_add(&subflow->node, &msk->conn_list);
e16163b6 115 sock_hold(ssock->sk);
116 subflow->request_mptcp = 1;
117
118	/* accept() will wait on first subflow sk_wq, and we always wake up
119 * via msk->sk_socket
120 */
121 RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);
122
fa68018d 123 return 0;
124}
125
126static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
127{
128 sk_drops_add(sk, skb);
129 __kfree_skb(skb);
130}
131
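/* Try to glue @from onto the tail skb @to: only allowed when @from has no
 * payload offset and the two skbs can be physically merged. On success the
 * MPTCP-level end_seq of @to is extended and the memory delta is charged
 * to the msk socket.
 */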
132static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
133 struct sk_buff *from)
134{
135 bool fragstolen;
136 int delta;
137
138 if (MPTCP_SKB_CB(from)->offset ||
139 !skb_try_coalesce(to, from, &fragstolen, &delta))
140 return false;
141
142	pr_debug("coalesced seq %llx into %llx new len %d new end seq %llx",
143 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
144 to->len, MPTCP_SKB_CB(from)->end_seq);
ab174ad8 145 MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
146 kfree_skb_partial(from, fragstolen);
147 atomic_add(delta, &sk->sk_rmem_alloc);
148 sk_mem_charge(sk, delta);
149 return true;
150}
151
152static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
153 struct sk_buff *from)
154{
155 if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
156 return false;
157
158 return mptcp_try_coalesce((struct sock *)msk, to, from);
159}
160
161/* "inspired" by tcp_data_queue_ofo(), main differences:
162 * - use mptcp seqs
163 * - don't cope with sacks
164 */
165static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
166{
167 struct sock *sk = (struct sock *)msk;
168 struct rb_node **p, *parent;
169 u64 seq, end_seq, max_seq;
170 struct sk_buff *skb1;
171
172 seq = MPTCP_SKB_CB(skb)->map_seq;
173 end_seq = MPTCP_SKB_CB(skb)->end_seq;
fa3fe2b1 174 max_seq = READ_ONCE(msk->rcv_wnd_sent);
ab174ad8 175
176 pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
177 RB_EMPTY_ROOT(&msk->out_of_order_queue));
fa3fe2b1 178 if (after64(end_seq, max_seq)) {
179 /* out of window */
180 mptcp_drop(sk, skb);
181 pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
182		 (unsigned long long)end_seq - (unsigned long long)max_seq,
183 (unsigned long long)msk->rcv_wnd_sent);
06242e44 184 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
185 return;
186 }
187
188 p = &msk->out_of_order_queue.rb_node;
06242e44 189 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
190 if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
191 rb_link_node(&skb->rbnode, NULL, p);
192 rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
193 msk->ooo_last_skb = skb;
194 goto end;
195 }
196
197 /* with 2 subflows, adding at end of ooo queue is quite likely
198 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
199 */
200 if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
201 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
202 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
ab174ad8 203 return;
06242e44 204 }
205
206 /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
207 if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
06242e44 208 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
209 parent = &msk->ooo_last_skb->rbnode;
210 p = &parent->rb_right;
211 goto insert;
212 }
213
214 /* Find place to insert this segment. Handle overlaps on the way. */
215 parent = NULL;
216 while (*p) {
217 parent = *p;
218 skb1 = rb_to_skb(parent);
219 if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
220 p = &parent->rb_left;
221 continue;
222 }
223 if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
224 if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
225 /* All the bits are present. Drop. */
226 mptcp_drop(sk, skb);
06242e44 227 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
228 return;
229 }
230 if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
231 /* partial overlap:
232 * | skb |
233 * | skb1 |
234 * continue traversing
235 */
236 } else {
237 /* skb's seq == skb1's seq and skb covers skb1.
238 * Replace skb1 with skb.
239 */
240 rb_replace_node(&skb1->rbnode, &skb->rbnode,
241 &msk->out_of_order_queue);
242 mptcp_drop(sk, skb1);
06242e44 243 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
244 goto merge_right;
245 }
246 } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) {
06242e44 247 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
248 return;
249 }
250 p = &parent->rb_right;
251 }
06242e44 252
253insert:
254 /* Insert segment into RB tree. */
255 rb_link_node(&skb->rbnode, parent, p);
256 rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
257
258merge_right:
259 /* Remove other segments covered by skb. */
260 while ((skb1 = skb_rb_next(skb)) != NULL) {
261 if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
262 break;
263 rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
264 mptcp_drop(sk, skb1);
06242e44 265 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
266 }
267 /* If there is no skb after us, we are the last_skb ! */
268 if (!skb1)
269 msk->ooo_last_skb = skb;
270
271end:
272 skb_condense(skb);
273 skb_set_owner_r(skb, sk);
274}
275
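/* Move one skb from the subflow receive queue into the msk: in-sequence
 * data goes to the msk receive queue (coalescing with the tail when
 * possible), future data is queued in the out-of-order rbtree and stale
 * data is dropped. Returns true if the msk receive queue grew.
 */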
276static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
277 struct sk_buff *skb, unsigned int offset,
278 size_t copy_len)
6771bfd9 279{
ab174ad8 280 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
6771bfd9 281 struct sock *sk = (struct sock *)msk;
4e637c70 282 struct sk_buff *tail;
283
284 __skb_unlink(skb, &ssk->sk_receive_queue);
6771bfd9 285
286 skb_ext_reset(skb);
287 skb_orphan(skb);
ab174ad8 288
289 /* try to fetch required memory from subflow */
290 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
291 if (ssk->sk_forward_alloc < skb->truesize)
292 goto drop;
293 __sk_mem_reclaim(ssk, skb->truesize);
294 if (!sk_rmem_schedule(sk, skb, skb->truesize))
295 goto drop;
296 }
297
298 /* the skb map_seq accounts for the skb offset:
299 * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
300 * value
301 */
302 MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
303 MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
8268ed4c 304 MPTCP_SKB_CB(skb)->offset = offset;
4e637c70 305
306 if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
307 /* in sequence */
8b0308fe 308 WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
309 tail = skb_peek_tail(&sk->sk_receive_queue);
310 if (tail && mptcp_try_coalesce(sk, tail, skb))
311 return true;
4e637c70 312
313 skb_set_owner_r(skb, sk);
314 __skb_queue_tail(&sk->sk_receive_queue, skb);
315 return true;
316 } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
317 mptcp_data_queue_ofo(msk, skb);
318 return false;
319 }
320
321 /* old data, keep it simple and drop the whole pkt, sender
322 * will retransmit as needed, if needed.
323 */
06242e44 324 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
9c3f94e1 325drop:
326 mptcp_drop(sk, skb);
327 return false;
328}
329
330static void mptcp_stop_timer(struct sock *sk)
331{
332 struct inet_connection_sock *icsk = inet_csk(sk);
333
334 sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
335 mptcp_sk(sk)->timer_ival = 0;
336}
337
338static void mptcp_close_wake_up(struct sock *sk)
339{
340 if (sock_flag(sk, SOCK_DEAD))
341 return;
342
343 sk->sk_state_change(sk);
344 if (sk->sk_shutdown == SHUTDOWN_MASK ||
345 sk->sk_state == TCP_CLOSE)
346 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
347 else
348 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
349}
350
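/* Progress the msk state machine once the peer has acked our DATA_FIN,
 * i.e. when snd_una has caught up with write_seq while closing.
 */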
351static void mptcp_check_data_fin_ack(struct sock *sk)
352{
353 struct mptcp_sock *msk = mptcp_sk(sk);
354
355 if (__mptcp_check_fallback(msk))
356 return;
357
358 /* Look for an acknowledged DATA_FIN */
359 if (((1 << sk->sk_state) &
360 (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
361 msk->write_seq == atomic64_read(&msk->snd_una)) {
362 mptcp_stop_timer(sk);
363
364 WRITE_ONCE(msk->snd_data_fin_enable, 0);
365
366 switch (sk->sk_state) {
367 case TCP_FIN_WAIT1:
368 inet_sk_state_store(sk, TCP_FIN_WAIT2);
369 break;
370 case TCP_CLOSING:
371 case TCP_LAST_ACK:
372 inet_sk_state_store(sk, TCP_CLOSE);
373 break;
374 }
375
e16163b6 376 mptcp_close_wake_up(sk);
377 }
378}
379
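/* Return true - optionally reporting the DATA_FIN sequence number via @seq -
 * when a received DATA_FIN is pending, the socket state allows processing it
 * and the receive side has caught up with its sequence number.
 */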
380static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
381{
382 struct mptcp_sock *msk = mptcp_sk(sk);
383
384 if (READ_ONCE(msk->rcv_data_fin) &&
385 ((1 << sk->sk_state) &
386 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) {
387 u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq);
388
389 if (msk->ack_seq == rcv_data_fin_seq) {
390 if (seq)
391 *seq = rcv_data_fin_seq;
392
393 return true;
394 }
395 }
396
397 return false;
398}
399
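/* Refresh the MPTCP-level retransmit interval: prefer the time left on the
 * given subflow's pending timer, then the current msk value, falling back
 * to TCP_RTO_MIN.
 */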
400static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
401{
402 long tout = ssk && inet_csk(ssk)->icsk_pending ?
403 inet_csk(ssk)->icsk_timeout - jiffies : 0;
404
405 if (tout <= 0)
406 tout = mptcp_sk(sk)->timer_ival;
407 mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
408}
409
410static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
411{
412 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
413
414	/* can't send if the JOIN hasn't completed yet (i.e. subflow not yet usable for mptcp) */
415 if (subflow->request_join && !subflow->fully_established)
416 return false;
417
418 /* only send if our side has not closed yet */
419 return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
420}
421
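/* When @force is set, send an ack on every subflow; otherwise just run
 * tcp_cleanup_rbuf() on the hinted (or last walked) subflow, so that TCP
 * can emit a window update if needed.
 */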
422static void mptcp_send_ack(struct mptcp_sock *msk, bool force)
423{
424 struct mptcp_subflow_context *subflow;
ea4ca586 425 struct sock *pick = NULL;
426
427 mptcp_for_each_subflow(msk, subflow) {
428 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
429
430 if (force) {
431 lock_sock(ssk);
432 tcp_send_ack(ssk);
433 release_sock(ssk);
434 continue;
435 }
436
437		/* if the hinted ssk is still active, use it */
438 pick = ssk;
439 if (ssk == msk->ack_hint)
440 break;
441 }
442 if (!force && pick) {
443 lock_sock(pick);
444 tcp_cleanup_rbuf(pick, 1);
445 release_sock(pick);
446 }
447}
448
449static bool mptcp_check_data_fin(struct sock *sk)
450{
451 struct mptcp_sock *msk = mptcp_sk(sk);
452 u64 rcv_data_fin_seq;
7ed90803 453 bool ret = false;
454
455 if (__mptcp_check_fallback(msk) || !msk->first)
7ed90803 456 return ret;
457
458 /* Need to ack a DATA_FIN received from a peer while this side
459 * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
460 * msk->rcv_data_fin was set when parsing the incoming options
461 * at the subflow level and the msk lock was not held, so this
462 * is the first opportunity to act on the DATA_FIN and change
463 * the msk state.
464 *
465 * If we are caught up to the sequence number of the incoming
466 * DATA_FIN, send the DATA_ACK now and do state transition. If
467 * not caught up, do nothing and let the recv code send DATA_ACK
468 * when catching up.
469 */
470
471 if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) {
917944da 472 WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1);
473 WRITE_ONCE(msk->rcv_data_fin, 0);
474
475 sk->sk_shutdown |= RCV_SHUTDOWN;
476 smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
477 set_bit(MPTCP_DATA_READY, &msk->flags);
478
479 switch (sk->sk_state) {
480 case TCP_ESTABLISHED:
481 inet_sk_state_store(sk, TCP_CLOSE_WAIT);
482 break;
483 case TCP_FIN_WAIT1:
484 inet_sk_state_store(sk, TCP_CLOSING);
485 break;
486 case TCP_FIN_WAIT2:
487 inet_sk_state_store(sk, TCP_CLOSE);
488 break;
489 default:
490 /* Other states not expected */
491 WARN_ON_ONCE(1);
492 break;
493 }
494
7ed90803 495 ret = true;
3721b9b6 496 mptcp_set_timeout(sk, NULL);
ea4ca586 497 mptcp_send_ack(msk, true);
e16163b6 498 mptcp_close_wake_up(sk);
3721b9b6 499 }
7ed90803 500 return ret;
501}
502
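/* Drain in-sequence data from a single subflow into the msk queues,
 * stopping when the subflow runs out of data or the msk receive buffer
 * limit is hit; @bytes is incremented by the amount of data moved.
 */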
503static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
504 struct sock *ssk,
505 unsigned int *bytes)
506{
507 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
600911ff 508 struct sock *sk = (struct sock *)msk;
509 unsigned int moved = 0;
510 bool more_data_avail;
511 struct tcp_sock *tp;
512 bool done = false;
513 int sk_rbuf;
514
515 sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
516
517 if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
518 int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
519
520 if (unlikely(ssk_rbuf > sk_rbuf)) {
521 WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf);
522 sk_rbuf = ssk_rbuf;
523 }
524 }
600911ff 525
ab174ad8 526 pr_debug("msk=%p ssk=%p", msk, ssk);
527 tp = tcp_sk(ssk);
528 do {
529 u32 map_remaining, offset;
530 u32 seq = tp->copied_seq;
531 struct sk_buff *skb;
532 bool fin;
533
534 /* try to move as much data as available */
535 map_remaining = subflow->map_data_len -
536 mptcp_subflow_get_map_offset(subflow);
537
538 skb = skb_peek(&ssk->sk_receive_queue);
539 if (!skb) {
540 /* if no data is found, a racing workqueue/recvmsg
541 * already processed the new data, stop here or we
542 * can enter an infinite loop
543 */
544 if (!moved)
545 done = true;
6771bfd9 546 break;
d9fb8c50 547 }
6771bfd9 548
549 if (__mptcp_check_fallback(msk)) {
550 /* if we are running under the workqueue, TCP could have
551			 * collapsed skbs between dummy map creation and now, so
552 * be sure to adjust the size
553 */
554 map_remaining = skb->len;
555 subflow->map_data_len = skb->len;
556 }
557
558 offset = seq - TCP_SKB_CB(skb)->seq;
559 fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
560 if (fin) {
561 done = true;
562 seq++;
563 }
564
565 if (offset < skb->len) {
566 size_t len = skb->len - offset;
567
568 if (tp->urg_data)
569 done = true;
570
571 if (__mptcp_move_skb(msk, ssk, skb, offset, len))
572 moved += len;
6771bfd9 573 seq += len;
574
575 if (WARN_ON_ONCE(map_remaining < len))
576 break;
577 } else {
578 WARN_ON_ONCE(!fin);
579 sk_eat_skb(ssk, skb);
580 done = true;
581 }
582
583 WRITE_ONCE(tp->copied_seq, seq);
584 more_data_avail = mptcp_subflow_data_available(ssk);
600911ff 585
13c7ba0c 586 if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) {
587 done = true;
588 break;
589 }
6771bfd9 590 } while (more_data_avail);
ea4ca586 591 msk->ack_hint = ssk;
6771bfd9 592
6719331c 593 *bytes += moved;
594 return done;
595}
596
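/* Move the skbs that became in-sequence from the out-of-order rbtree to
 * the msk receive queue, trimming any overlap with data already received.
 * Returns true if at least one skb was moved.
 */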
597static bool mptcp_ofo_queue(struct mptcp_sock *msk)
598{
599 struct sock *sk = (struct sock *)msk;
600 struct sk_buff *skb, *tail;
601 bool moved = false;
602 struct rb_node *p;
603 u64 end_seq;
604
605 p = rb_first(&msk->out_of_order_queue);
06242e44 606 pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
ab174ad8
PA
607 while (p) {
608 skb = rb_to_skb(p);
609 if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
610 break;
611
612 p = rb_next(p);
613 rb_erase(&skb->rbnode, &msk->out_of_order_queue);
614
615 if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq,
616 msk->ack_seq))) {
617 mptcp_drop(sk, skb);
06242e44 618 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
619 continue;
620 }
621
622 end_seq = MPTCP_SKB_CB(skb)->end_seq;
623 tail = skb_peek_tail(&sk->sk_receive_queue);
624 if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) {
625 int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
626
627 /* skip overlapping data, if any */
628 pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
629 MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
630 delta);
631 MPTCP_SKB_CB(skb)->offset += delta;
632 __skb_queue_tail(&sk->sk_receive_queue, skb);
633 }
634 msk->ack_seq = end_seq;
635 moved = true;
636 }
637 return moved;
638}
639
640/* In most cases we will be able to lock the mptcp socket. If it's already
641 * owned, we need to defer to the work queue to avoid ABBA deadlock.
642 */
643static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
644{
645 struct sock *sk = (struct sock *)msk;
646 unsigned int moved = 0;
647
648 if (READ_ONCE(sk->sk_lock.owned))
649 return false;
650
651 if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock)))
652 return false;
653
654 /* must re-check after taking the lock */
ab174ad8 655 if (!READ_ONCE(sk->sk_lock.owned)) {
2e52213c 656 __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
657 mptcp_ofo_queue(msk);
658
659 /* If the moves have caught up with the DATA_FIN sequence number
660 * it's time to ack the DATA_FIN and change socket state, but
661 * this is not a good place to change state. Let the workqueue
662 * do it.
663 */
664 if (mptcp_pending_data_fin(sk, NULL))
665 mptcp_schedule_work(sk);
ab174ad8 666 }
667
668 spin_unlock_bh(&sk->sk_lock.slock);
669
670 return moved > 0;
671}
672
673void mptcp_data_ready(struct sock *sk, struct sock *ssk)
101f6f85 674{
6719331c 675 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
101f6f85 676 struct mptcp_sock *msk = mptcp_sk(sk);
13c7ba0c 677 int sk_rbuf, ssk_rbuf;
6719331c 678 bool wake;
101f6f85 679
680	/* move_skbs_to_msk below can legitimately clear the data_avail flag,
681	 * but we will later need to properly wake the reader; cache its
682 * value
683 */
684 wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
685 if (wake)
686 set_bit(MPTCP_DATA_READY, &msk->flags);
6771bfd9 687
688 ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
689 sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
690 if (unlikely(ssk_rbuf > sk_rbuf))
691 sk_rbuf = ssk_rbuf;
692
693 /* over limit? can't append more skbs to msk */
694 if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
695 goto wake;
696
ea4ca586 697 move_skbs_to_msk(msk, ssk);
600911ff 698
600911ff 699wake:
700 if (wake)
701 sk->sk_data_ready(sk);
702}
703
84dfe367 704void __mptcp_flush_join_list(struct mptcp_sock *msk)
ec3edaa7
PK
705{
706 if (likely(list_empty(&msk->join_list)))
707 return;
708
709 spin_lock_bh(&msk->join_list_lock);
710 list_splice_tail_init(&msk->join_list, &msk->conn_list);
711 spin_unlock_bh(&msk->join_list_lock);
712}
713
b51f9b80
PA
714static bool mptcp_timer_pending(struct sock *sk)
715{
716 return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
717}
718
719static void mptcp_reset_timer(struct sock *sk)
720{
721 struct inet_connection_sock *icsk = inet_csk(sk);
722 unsigned long tout;
723
e16163b6
PA
724 /* prevent rescheduling on close */
725 if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
726 return;
727
b51f9b80
PA
728 /* should never be called with mptcp level timer cleared */
729 tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
730 if (WARN_ON_ONCE(!tout))
731 tout = TCP_RTO_MIN;
732 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
733}
734
ba8f48f7
PA
735bool mptcp_schedule_work(struct sock *sk)
736{
737 if (inet_sk_state_load(sk) != TCP_CLOSE &&
738 schedule_work(&mptcp_sk(sk)->work)) {
739 /* each subflow already holds a reference to the sk, and the
740 * workqueue is invoked by a subflow, so sk can't go away here.
741 */
742 sock_hold(sk);
743 return true;
744 }
745 return false;
746}
747
b51f9b80
PA
748void mptcp_data_acked(struct sock *sk)
749{
750 mptcp_reset_timer(sk);
3b1d6210 751
8edf0864 752 if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) ||
813e0a68 753 mptcp_send_head(sk) ||
ba8f48f7
PA
754 (inet_sk_state_load(sk) != TCP_ESTABLISHED)))
755 mptcp_schedule_work(sk);
b51f9b80
PA
756}
757
59832e24
FW
758void mptcp_subflow_eof(struct sock *sk)
759{
ba8f48f7
PA
760 if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags))
761 mptcp_schedule_work(sk);
59832e24
FW
762}
763
5969856a
PA
764static void mptcp_check_for_eof(struct mptcp_sock *msk)
765{
766 struct mptcp_subflow_context *subflow;
767 struct sock *sk = (struct sock *)msk;
768 int receivers = 0;
769
770 mptcp_for_each_subflow(msk, subflow)
771 receivers += !subflow->rx_eof;
e16163b6
PA
772 if (receivers)
773 return;
5969856a 774
e16163b6 775 if (!(sk->sk_shutdown & RCV_SHUTDOWN)) {
5969856a
PA
776 /* hopefully temporary hack: propagate shutdown status
777 * to msk, when all subflows agree on it
778 */
779 sk->sk_shutdown |= RCV_SHUTDOWN;
780
781 smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
782 set_bit(MPTCP_DATA_READY, &msk->flags);
783 sk->sk_data_ready(sk);
784 }
e16163b6
PA
785
786 switch (sk->sk_state) {
787 case TCP_ESTABLISHED:
788 inet_sk_state_store(sk, TCP_CLOSE_WAIT);
789 break;
790 case TCP_FIN_WAIT1:
26aa2314
PA
791 inet_sk_state_store(sk, TCP_CLOSING);
792 break;
793 case TCP_FIN_WAIT2:
e16163b6
PA
794 inet_sk_state_store(sk, TCP_CLOSE);
795 break;
796 default:
797 return;
798 }
799 mptcp_close_wake_up(sk);
5969856a
PA
800}
801
6d0060f6
MM
802static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
803{
4930f483
FW
804 const struct sock *sk = (const struct sock *)msk;
805
6d0060f6 806 if (!msk->cached_ext)
4930f483 807 msk->cached_ext = __skb_ext_alloc(sk->sk_allocation);
6d0060f6
MM
808
809 return !!msk->cached_ext;
810}
811
7a6a6cbc
PA
812static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
813{
814 struct mptcp_subflow_context *subflow;
815 struct sock *sk = (struct sock *)msk;
816
817 sock_owned_by_me(sk);
818
819 mptcp_for_each_subflow(msk, subflow) {
820 if (subflow->data_avail)
821 return mptcp_subflow_tcp_sock(subflow);
822 }
823
824 return NULL;
825}
826
3f8e0aae
PA
827static bool mptcp_skb_can_collapse_to(u64 write_seq,
828 const struct sk_buff *skb,
829 const struct mptcp_ext *mpext)
57040755
PA
830{
831 if (!tcp_skb_can_collapse_to(skb))
832 return false;
833
5a369ca6
PA
834 /* can collapse only if MPTCP level sequence is in order and this
835 * mapping has not been xmitted yet
836 */
837 return mpext && mpext->data_seq + mpext->data_len == write_seq &&
838 !mpext->frozen;
57040755
PA
839}
840
18b683bf
PA
841static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
842 const struct page_frag *pfrag,
843 const struct mptcp_data_frag *df)
844{
845 return df && pfrag->page == df->page &&
d9ca1de8 846 pfrag->size - pfrag->offset > 0 &&
18b683bf
PA
847 df->data_seq + df->data_len == msk->write_seq;
848}
849
d027236c
PA
850static void dfrag_uncharge(struct sock *sk, int len)
851{
852 sk_mem_uncharge(sk, len);
7948f6cc 853 sk_wmem_queued_add(sk, -len);
d027236c
PA
854}
855
856static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
18b683bf 857{
d027236c
PA
858 int len = dfrag->data_len + dfrag->overhead;
859
18b683bf 860 list_del(&dfrag->list);
d027236c 861 dfrag_uncharge(sk, len);
18b683bf
PA
862 put_page(dfrag->page);
863}
864
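/* Drop from the rtx queue the data fragments fully acked at the MPTCP
 * level (per msk->snd_una), trim the first partially acked one and
 * release the write memory they were pinning.
 */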
865static void mptcp_clean_una(struct sock *sk)
866{
867 struct mptcp_sock *msk = mptcp_sk(sk);
868 struct mptcp_data_frag *dtmp, *dfrag;
d027236c 869 bool cleaned = false;
e1ff9e82
DC
870 u64 snd_una;
871
872 /* on fallback we just need to ignore snd_una, as this is really
873 * plain TCP
874 */
875 if (__mptcp_check_fallback(msk))
eaa2ffab 876 atomic64_set(&msk->snd_una, msk->snd_nxt);
6f8a612a 877
e1ff9e82 878 snd_una = atomic64_read(&msk->snd_una);
18b683bf
PA
879
880 list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
881 if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
882 break;
883
d9ca1de8
PA
884 if (WARN_ON_ONCE(dfrag == msk->first_pending))
885 break;
d027236c
PA
886 dfrag_clear(sk, dfrag);
887 cleaned = true;
888 }
889
7948f6cc
FW
890 dfrag = mptcp_rtx_head(sk);
891 if (dfrag && after64(snd_una, dfrag->data_seq)) {
53eb4c38
PA
892 u64 delta = snd_una - dfrag->data_seq;
893
d9ca1de8 894 if (WARN_ON_ONCE(delta > dfrag->already_sent))
53eb4c38 895 goto out;
7948f6cc
FW
896
897 dfrag->data_seq += delta;
53eb4c38 898 dfrag->offset += delta;
7948f6cc 899 dfrag->data_len -= delta;
d9ca1de8 900 dfrag->already_sent -= delta;
7948f6cc
FW
901
902 dfrag_uncharge(sk, delta);
903 cleaned = true;
904 }
905
53eb4c38 906out:
95ed690e 907 if (cleaned)
d027236c 908 sk_mem_reclaim_partial(sk);
95ed690e 909}
7948f6cc 910
95ed690e
FW
911static void mptcp_clean_una_wakeup(struct sock *sk)
912{
913 struct mptcp_sock *msk = mptcp_sk(sk);
63561a40 914
95ed690e
FW
915 mptcp_clean_una(sk);
916
917 /* Only wake up writers if a subflow is ready */
8edf0864
FW
918 if (sk_stream_is_writeable(sk)) {
919 clear_bit(MPTCP_NOSPACE, &msk->flags);
95ed690e 920 sk_stream_write_space(sk);
18b683bf
PA
921 }
922}
923
924/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
925 * data
926 */
927static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
928{
d9ca1de8
PA
929 struct mptcp_subflow_context *subflow;
930 struct mptcp_sock *msk = mptcp_sk(sk);
931 bool first = true;
932
18b683bf
PA
933 if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
934 pfrag, sk->sk_allocation)))
935 return true;
936
18b683bf 937 sk_stream_moderate_sndbuf(sk);
d9ca1de8
PA
938 mptcp_for_each_subflow(msk, subflow) {
939 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
940
941 if (first)
942 tcp_enter_memory_pressure(ssk);
943 sk_stream_moderate_sndbuf(ssk);
944 first = false;
945 }
18b683bf
PA
946 return false;
947}
948
949static struct mptcp_data_frag *
950mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
951 int orig_offset)
952{
953 int offset = ALIGN(orig_offset, sizeof(long));
954 struct mptcp_data_frag *dfrag;
955
956 dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
957 dfrag->data_len = 0;
958 dfrag->data_seq = msk->write_seq;
959 dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
960 dfrag->offset = offset + sizeof(struct mptcp_data_frag);
d9ca1de8 961 dfrag->already_sent = 0;
18b683bf
PA
962 dfrag->page = pfrag->page;
963
964 return dfrag;
965}
966
caf971df
PA
967struct mptcp_sendmsg_info {
968 int mss_now;
969 int size_goal;
d9ca1de8
PA
970 u16 limit;
971 u16 sent;
972 unsigned int flags;
caf971df
PA
973};
974
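/* Clamp @avail_size so that the data to be sent never exceeds the
 * MPTCP-level receive window advertised by the peer.
 */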
975static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq,
976 int avail_size)
977{
978 u64 window_end = mptcp_wnd_end(msk);
979
980 if (__mptcp_check_fallback(msk))
981 return avail_size;
982
983 if (!before64(data_seq + avail_size, window_end)) {
984 u64 allowed_size = window_end - data_seq;
985
986 return min_t(unsigned int, allowed_size, avail_size);
987 }
988
989 return avail_size;
990}
991
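/* Push on @ssk up to info->limit - info->sent bytes of @dfrag, attaching the
 * DSS mapping via the MPTCP skb extension and collapsing onto the tail skb
 * when allowed. Returns the number of bytes queued, 0 when nothing can be
 * sent (or after a zero-window probe), or a negative error.
 */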
6d0060f6 992static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
d9ca1de8 993 struct mptcp_data_frag *dfrag,
caf971df 994 struct mptcp_sendmsg_info *info)
6d0060f6 995{
d9ca1de8 996 u64 data_seq = dfrag->data_seq + info->sent;
6d0060f6 997 struct mptcp_sock *msk = mptcp_sk(sk);
6f8a612a 998 bool zero_window_probe = false;
6d0060f6 999 struct mptcp_ext *mpext = NULL;
57040755 1000 struct sk_buff *skb, *tail;
d9ca1de8
PA
1001 bool can_collapse = false;
1002 int avail_size;
1003 size_t ret;
6d0060f6 1004
d9ca1de8
PA
1005 pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d",
1006 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
1007
1008 /* compute send limit */
1009 info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
caf971df 1010 avail_size = info->size_goal;
57040755
PA
1011 skb = tcp_write_queue_tail(ssk);
1012 if (skb) {
57040755
PA
1013 /* Limit the write to the size available in the
1014 * current skb, if any, so that we create at most a new skb.
1015 * Explicitly tells TCP internals to avoid collapsing on later
1016 * queue management operation, to avoid breaking the ext <->
1017 * SSN association set here
1018 */
d9ca1de8 1019 mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
caf971df 1020 can_collapse = (info->size_goal - skb->len > 0) &&
d9ca1de8 1021 mptcp_skb_can_collapse_to(data_seq, skb, mpext);
57040755
PA
1022 if (!can_collapse)
1023 TCP_SKB_CB(skb)->eor = 1;
1024 else
caf971df 1025 avail_size = info->size_goal - skb->len;
57040755 1026 }
18b683bf 1027
6f8a612a
FW
1028 /* Zero window and all data acked? Probe. */
1029 avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
1030 if (avail_size == 0) {
1031 if (skb || atomic64_read(&msk->snd_una) != msk->snd_nxt)
1032 return 0;
1033 zero_window_probe = true;
1034 data_seq = atomic64_read(&msk->snd_una) - 1;
1035 avail_size = 1;
1036 }
1037
d9ca1de8
PA
1038 if (WARN_ON_ONCE(info->sent > info->limit ||
1039 info->limit > dfrag->data_len))
1040 return 0;
d027236c 1041
d9ca1de8
PA
1042 ret = info->limit - info->sent;
1043 tail = tcp_build_frag(ssk, avail_size, info->flags, dfrag->page,
1044 dfrag->offset + info->sent, &ret);
e2223995
PA
1045 if (!tail) {
1046 tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
1047 return -ENOMEM;
35759383 1048 }
18b683bf 1049
e2223995 1050 /* if the tail skb is still the cached one, collapsing really happened.
57040755 1051 */
e2223995 1052 if (skb == tail) {
57040755
PA
1053 WARN_ON_ONCE(!can_collapse);
1054 mpext->data_len += ret;
6f8a612a 1055 WARN_ON_ONCE(zero_window_probe);
57040755
PA
1056 goto out;
1057 }
1058
e2223995 1059 mpext = __skb_ext_set(tail, SKB_EXT_MPTCP, msk->cached_ext);
6d0060f6
MM
1060 msk->cached_ext = NULL;
1061
1062 memset(mpext, 0, sizeof(*mpext));
d9ca1de8 1063 mpext->data_seq = data_seq;
6d0060f6
MM
1064 mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
1065 mpext->data_len = ret;
1066 mpext->use_map = 1;
1067 mpext->dsn64 = 1;
1068
1069 pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
1070 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
1071 mpext->dsn64);
1072
6f8a612a
FW
1073 if (zero_window_probe) {
1074 mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
1075 mpext->frozen = 1;
1076 ret = 0;
1077 tcp_push_pending_frames(ssk);
1078 }
57040755 1079out:
6d0060f6 1080 mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
6d0060f6
MM
1081 return ret;
1082}
1083
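/* Mark the msk as lacking send space and set SOCK_NOSPACE on every
 * non-writeable subflow socket, so their write_space callbacks will wake
 * the msk writer once memory is available again.
 */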
63561a40 1084static void mptcp_nospace(struct mptcp_sock *msk)
a0e17064 1085{
63561a40
PA
1086 struct mptcp_subflow_context *subflow;
1087
8edf0864 1088 set_bit(MPTCP_NOSPACE, &msk->flags);
a0e17064
FW
1089 smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
1090
63561a40
PA
1091 mptcp_for_each_subflow(msk, subflow) {
1092 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
8edf0864 1093 bool ssk_writeable = sk_stream_is_writeable(ssk);
63561a40
PA
1094 struct socket *sock = READ_ONCE(ssk->sk_socket);
1095
8edf0864
FW
1096 if (ssk_writeable || !sock)
1097 continue;
1098
63561a40 1099 /* enables ssk->write_space() callbacks */
8edf0864 1100 set_bit(SOCK_NOSPACE, &sock->flags);
63561a40 1101 }
8edf0864
FW
1102
1103 /* mptcp_data_acked() could run just before we set the NOSPACE bit,
1104 * so explicitly check for snd_una value
1105 */
1106 mptcp_clean_una((struct sock *)msk);
a0e17064
FW
1107}
1108
d5f49190
PA
1109#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \
1110 sizeof(struct tcphdr) - \
1111 MAX_TCP_OPTION_SPACE - \
1112 sizeof(struct ipv6hdr) - \
1113 sizeof(struct frag_hdr))
1114
1115struct subflow_send_info {
1116 struct sock *ssk;
1117 u64 ratio;
1118};
1119
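/* Packet scheduler: keep using the last selected subflow while its send
 * burst allowance lasts, otherwise pick the active (preferably non-backup)
 * subflow with the lowest wmem/pacing-rate ratio. @sndbuf reports the
 * largest subflow send window, used by the caller for sndbuf autotuning.
 */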
1120static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
1121 u32 *sndbuf)
f296234c 1122{
d5f49190 1123 struct subflow_send_info send_info[2];
f296234c 1124 struct mptcp_subflow_context *subflow;
d5f49190
PA
1125 int i, nr_active = 0;
1126 struct sock *ssk;
1127 u64 ratio;
1128 u32 pace;
f296234c 1129
d5f49190 1130 sock_owned_by_me((struct sock *)msk);
f296234c 1131
da51aef5 1132 *sndbuf = 0;
149f7c71
FW
1133 if (!mptcp_ext_cache_refill(msk))
1134 return NULL;
1135
d5f49190
PA
1136 if (__mptcp_check_fallback(msk)) {
1137 if (!msk->first)
f296234c 1138 return NULL;
d5f49190
PA
1139 *sndbuf = msk->first->sk_sndbuf;
1140 return sk_stream_memory_free(msk->first) ? msk->first : NULL;
1141 }
1142
1143	/* re-use last subflow, if the burst allows that */
1144 if (msk->last_snd && msk->snd_burst > 0 &&
1145 sk_stream_memory_free(msk->last_snd) &&
1146 mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
1147 mptcp_for_each_subflow(msk, subflow) {
1148 ssk = mptcp_subflow_tcp_sock(subflow);
1149 *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
f296234c 1150 }
d5f49190
PA
1151 return msk->last_snd;
1152 }
f296234c 1153
d5f49190
PA
1154 /* pick the subflow with the lower wmem/wspace ratio */
1155 for (i = 0; i < 2; ++i) {
1156 send_info[i].ssk = NULL;
1157 send_info[i].ratio = -1;
1158 }
1159 mptcp_for_each_subflow(msk, subflow) {
1160 ssk = mptcp_subflow_tcp_sock(subflow);
1161 if (!mptcp_subflow_active(subflow))
1162 continue;
1163
1164 nr_active += !subflow->backup;
da51aef5 1165 *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
d5f49190
PA
1166 if (!sk_stream_memory_free(subflow->tcp_sock))
1167 continue;
f296234c 1168
d5f49190
PA
1169 pace = READ_ONCE(ssk->sk_pacing_rate);
1170 if (!pace)
f296234c 1171 continue;
f296234c 1172
d5f49190
PA
1173 ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
1174 pace);
1175 if (ratio < send_info[subflow->backup].ratio) {
1176 send_info[subflow->backup].ssk = ssk;
1177 send_info[subflow->backup].ratio = ratio;
1178 }
f296234c
PK
1179 }
1180
d5f49190
PA
1181 pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
1182 msk, nr_active, send_info[0].ssk, send_info[0].ratio,
1183 send_info[1].ssk, send_info[1].ratio);
1184
1185 /* pick the best backup if no other subflow is active */
1186 if (!nr_active)
1187 send_info[0].ssk = send_info[1].ssk;
1188
1189 if (send_info[0].ssk) {
1190 msk->last_snd = send_info[0].ssk;
1191 msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
1192 sk_stream_wspace(msk->last_snd));
1193 return msk->last_snd;
1194 }
1195 return NULL;
f296234c
PK
1196}
1197
d9ca1de8
PA
1198static void mptcp_push_release(struct sock *sk, struct sock *ssk,
1199 struct mptcp_sendmsg_info *info)
1200{
1201 mptcp_set_timeout(sk, ssk);
1202 tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
1203 release_sock(ssk);
1204}
1205
1206static void mptcp_push_pending(struct sock *sk, unsigned int flags)
f870fa0b 1207{
d9ca1de8 1208 struct sock *prev_ssk = NULL, *ssk = NULL;
f870fa0b 1209 struct mptcp_sock *msk = mptcp_sk(sk);
caf971df 1210 struct mptcp_sendmsg_info info = {
d9ca1de8 1211 .flags = flags,
caf971df 1212 };
d9ca1de8
PA
1213 struct mptcp_data_frag *dfrag;
1214 int len, copied = 0;
1215 u32 sndbuf;
1216
1217 while ((dfrag = mptcp_send_head(sk))) {
1218 info.sent = dfrag->already_sent;
1219 info.limit = dfrag->data_len;
1220 len = dfrag->data_len - dfrag->already_sent;
1221 while (len > 0) {
1222 int ret = 0;
1223
1224 prev_ssk = ssk;
1225 __mptcp_flush_join_list(msk);
1226 ssk = mptcp_subflow_get_send(msk, &sndbuf);
1227
1228 /* do auto tuning */
1229 if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
1230 sndbuf > READ_ONCE(sk->sk_sndbuf))
1231 WRITE_ONCE(sk->sk_sndbuf, sndbuf);
1232
1233 /* try to keep the subflow socket lock across
1234 * consecutive xmit on the same socket
1235 */
1236 if (ssk != prev_ssk && prev_ssk)
1237 mptcp_push_release(sk, prev_ssk, &info);
1238 if (!ssk)
1239 goto out;
1240
1241 if (ssk != prev_ssk || !prev_ssk)
1242 lock_sock(ssk);
1243
1244 ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
1245 if (ret <= 0) {
1246 mptcp_push_release(sk, ssk, &info);
1247 goto out;
1248 }
1249
1250 info.sent += ret;
1251 dfrag->already_sent += ret;
1252 msk->snd_nxt += ret;
1253 msk->snd_burst -= ret;
1254 copied += ret;
1255 len -= ret;
1256 }
1257 WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
1258 }
1259
1260 /* at this point we held the socket lock for the last subflow we used */
1261 if (ssk)
1262 mptcp_push_release(sk, ssk, &info);
1263
1264out:
b680a214
PA
1265 if (copied) {
1266 /* start the timer, if it's not pending */
1267 if (!mptcp_timer_pending(sk))
1268 mptcp_reset_timer(sk);
d9ca1de8 1269 __mptcp_check_send_data_fin(sk);
b680a214 1270 }
d9ca1de8
PA
1271}
1272
1273static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
1274{
1275 struct mptcp_sock *msk = mptcp_sk(sk);
17091708 1276 struct page_frag *pfrag;
6d0060f6 1277 size_t copied = 0;
caf971df 1278 int ret = 0;
6d0060f6 1279 long timeo;
f870fa0b
MM
1280
1281 if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
1282 return -EOPNOTSUPP;
1283
cec37a6e 1284 lock_sock(sk);
1954b860
MM
1285
1286 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1287
1288 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
1289 ret = sk_stream_wait_connect(sk, &timeo);
1290 if (ret)
1291 goto out;
1292 }
1293
17091708 1294 pfrag = sk_page_frag(sk);
18b683bf
PA
1295 mptcp_clean_una(sk);
1296
d9ca1de8
PA
1297 while (msg_data_left(msg)) {
1298 struct mptcp_data_frag *dfrag;
1299 int frag_truesize = 0;
1300 bool dfrag_collapsed;
1301 size_t psize, offset;
18b683bf 1302
d9ca1de8
PA
1303 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
1304 ret = -EPIPE;
f296234c
PK
1305 goto out;
1306 }
da51aef5 1307
d9ca1de8
PA
1308 /* reuse tail pfrag, if possible, or carve a new one from the
1309 * page allocator
1310 */
1311 dfrag = mptcp_pending_tail(sk);
1312 dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
1313 if (!dfrag_collapsed) {
1314 if (!sk_stream_memory_free(sk)) {
1315 mptcp_push_pending(sk, msg->msg_flags);
1316 if (!sk_stream_memory_free(sk))
1317 goto wait_for_memory;
72511aab 1318 }
d9ca1de8
PA
1319 if (!mptcp_page_frag_refill(sk, pfrag))
1320 goto wait_for_memory;
1321
1322 dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset);
1323 frag_truesize = dfrag->overhead;
72511aab 1324 }
6d0060f6 1325
1326 /* we do not bound vs wspace, to allow a single packet.
1327		 * memory accounting will prevent excessive memory usage
1328 * anyway
d5f49190 1329 */
d9ca1de8
PA
1330 offset = dfrag->offset + dfrag->data_len;
1331 psize = pfrag->size - offset;
1332 psize = min_t(size_t, psize, msg_data_left(msg));
1333 if (!sk_wmem_schedule(sk, psize + frag_truesize))
1334 goto wait_for_memory;
1335
1336 if (copy_page_from_iter(dfrag->page, offset, psize,
1337 &msg->msg_iter) != psize) {
1338 ret = -EFAULT;
1339 goto out;
72511aab
FW
1340 }
1341
d9ca1de8
PA
1342 /* data successfully copied into the write queue */
1343 copied += psize;
1344 dfrag->data_len += psize;
1345 frag_truesize += psize;
1346 pfrag->offset += frag_truesize;
1347 WRITE_ONCE(msk->write_seq, msk->write_seq + psize);
1348
1349 /* charge data on mptcp pending queue to the msk socket
1350 * Note: we charge such data both to sk and ssk
fb529e62 1351 */
d9ca1de8
PA
1352 sk_wmem_queued_add(sk, frag_truesize);
1353 sk->sk_forward_alloc -= frag_truesize;
1354 if (!dfrag_collapsed) {
1355 get_page(dfrag->page);
1356 list_add_tail(&dfrag->list, &msk->rtx_queue);
1357 if (!msk->first_pending)
1358 WRITE_ONCE(msk->first_pending, dfrag);
fb529e62 1359 }
d9ca1de8
PA
1360 pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk,
1361 dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
1362 !dfrag_collapsed);
6d0060f6 1363
d9ca1de8
PA
1364 if (!mptcp_ext_cache_refill(msk))
1365 goto wait_for_memory;
1366 continue;
b51f9b80 1367
d9ca1de8
PA
1368wait_for_memory:
1369 mptcp_nospace(msk);
d9ca1de8 1370 if (mptcp_timer_pending(sk))
b51f9b80 1371 mptcp_reset_timer(sk);
d9ca1de8
PA
1372 ret = sk_stream_wait_memory(sk, &timeo);
1373 if (ret)
1374 goto out;
57040755 1375 }
6d0060f6 1376
d9ca1de8
PA
1377 if (copied)
1378 mptcp_push_pending(sk, msg->msg_flags);
1379
1954b860 1380out:
cec37a6e 1381 release_sock(sk);
8555c6bf 1382 return copied ? : ret;
f870fa0b
MM
1383}
1384
7a6a6cbc
PA
1385static void mptcp_wait_data(struct sock *sk, long *timeo)
1386{
1387 DEFINE_WAIT_FUNC(wait, woken_wake_function);
1388 struct mptcp_sock *msk = mptcp_sk(sk);
1389
1390 add_wait_queue(sk_sleep(sk), &wait);
1391 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
1392
1393 sk_wait_event(sk, timeo,
1394 test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);
1395
1396 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
1397 remove_wait_queue(sk_sleep(sk), &wait);
1398}
1399
6771bfd9
FW
1400static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
1401 struct msghdr *msg,
1402 size_t len)
1403{
1404 struct sock *sk = (struct sock *)msk;
1405 struct sk_buff *skb;
1406 int copied = 0;
1407
1408 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1409 u32 offset = MPTCP_SKB_CB(skb)->offset;
1410 u32 data_len = skb->len - offset;
1411 u32 count = min_t(size_t, len - copied, data_len);
1412 int err;
1413
1414 err = skb_copy_datagram_msg(skb, offset, msg, count);
1415 if (unlikely(err < 0)) {
1416 if (!copied)
1417 return err;
1418 break;
1419 }
1420
1421 copied += count;
1422
1423 if (count < data_len) {
1424 MPTCP_SKB_CB(skb)->offset += count;
1425 break;
1426 }
1427
1428 __skb_unlink(skb, &sk->sk_receive_queue);
1429 __kfree_skb(skb);
1430
1431 if (copied >= len)
1432 break;
1433 }
1434
1435 return copied;
1436}
1437
a6b118fe
FW
1438/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
1439 *
1440 * Only difference: Use highest rtt estimate of the subflows in use.
1441 */
1442static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
1443{
1444 struct mptcp_subflow_context *subflow;
1445 struct sock *sk = (struct sock *)msk;
1446 u32 time, advmss = 1;
1447 u64 rtt_us, mstamp;
1448
1449 sock_owned_by_me(sk);
1450
1451 if (copied <= 0)
1452 return;
1453
1454 msk->rcvq_space.copied += copied;
1455
1456 mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
1457 time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);
1458
1459 rtt_us = msk->rcvq_space.rtt_us;
1460 if (rtt_us && time < (rtt_us >> 3))
1461 return;
1462
1463 rtt_us = 0;
1464 mptcp_for_each_subflow(msk, subflow) {
1465 const struct tcp_sock *tp;
1466 u64 sf_rtt_us;
1467 u32 sf_advmss;
1468
1469 tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));
1470
1471 sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
1472 sf_advmss = READ_ONCE(tp->advmss);
1473
1474 rtt_us = max(sf_rtt_us, rtt_us);
1475 advmss = max(sf_advmss, advmss);
1476 }
1477
1478 msk->rcvq_space.rtt_us = rtt_us;
1479 if (time < (rtt_us >> 3) || rtt_us == 0)
1480 return;
1481
1482 if (msk->rcvq_space.copied <= msk->rcvq_space.space)
1483 goto new_measure;
1484
1485 if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
1486 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
1487 int rcvmem, rcvbuf;
1488 u64 rcvwin, grow;
1489
1490 rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
1491
1492 grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
1493
1494 do_div(grow, msk->rcvq_space.space);
1495 rcvwin += (grow << 1);
1496
1497 rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
1498 while (tcp_win_from_space(sk, rcvmem) < advmss)
1499 rcvmem += 128;
1500
1501 do_div(rcvwin, advmss);
1502 rcvbuf = min_t(u64, rcvwin * rcvmem,
1503 sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
1504
1505 if (rcvbuf > sk->sk_rcvbuf) {
1506 u32 window_clamp;
1507
1508 window_clamp = tcp_win_from_space(sk, rcvbuf);
1509 WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
1510
1511 /* Make subflows follow along. If we do not do this, we
1512 * get drops at subflow level if skbs can't be moved to
1513 * the mptcp rx queue fast enough (announced rcv_win can
1514 * exceed ssk->sk_rcvbuf).
1515 */
1516 mptcp_for_each_subflow(msk, subflow) {
1517 struct sock *ssk;
c76c6956 1518 bool slow;
a6b118fe
FW
1519
1520 ssk = mptcp_subflow_tcp_sock(subflow);
c76c6956 1521 slow = lock_sock_fast(ssk);
a6b118fe
FW
1522 WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
1523 tcp_sk(ssk)->window_clamp = window_clamp;
c76c6956
PA
1524 tcp_cleanup_rbuf(ssk, 1);
1525 unlock_sock_fast(ssk, slow);
a6b118fe
FW
1526 }
1527 }
1528 }
1529
1530 msk->rcvq_space.space = msk->rcvq_space.copied;
1531new_measure:
1532 msk->rcvq_space.copied = 0;
1533 msk->rcvq_space.time = mstamp;
1534}
1535
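/* Drain every subflow with pending data into the msk queues and flush the
 * out-of-order queue; a non zero @rcv hints how much receive space is
 * about to be freed, so the subflow can advertise an updated window.
 * Returns true if new data reached the msk receive queue.
 */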
ea4ca586 1536static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
6771bfd9
FW
1537{
1538 unsigned int moved = 0;
1539 bool done;
1540
d5f49190
PA
1541 /* avoid looping forever below on racing close */
1542 if (((struct sock *)msk)->sk_state == TCP_CLOSE)
1543 return false;
1544
1545 __mptcp_flush_join_list(msk);
6771bfd9
FW
1546 do {
1547 struct sock *ssk = mptcp_subflow_recv_lookup(msk);
65f49fe7 1548 bool slowpath;
6771bfd9
FW
1549
1550 if (!ssk)
1551 break;
1552
65f49fe7 1553 slowpath = lock_sock_fast(ssk);
6771bfd9 1554 done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
ea4ca586
PA
1555 if (moved && rcv) {
1556 WRITE_ONCE(msk->rmem_pending, min(rcv, moved));
1557 tcp_cleanup_rbuf(ssk, 1);
1558 WRITE_ONCE(msk->rmem_pending, 0);
1559 }
65f49fe7 1560 unlock_sock_fast(ssk, slowpath);
6771bfd9
FW
1561 } while (!done);
1562
ab174ad8 1563 if (mptcp_ofo_queue(msk) || moved > 0) {
ea4ca586 1564 mptcp_check_data_fin((struct sock *)msk);
ab174ad8
PA
1565 return true;
1566 }
1567 return false;
6771bfd9
FW
1568}
1569
f870fa0b
MM
1570static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
1571 int nonblock, int flags, int *addr_len)
1572{
1573 struct mptcp_sock *msk = mptcp_sk(sk);
cec37a6e 1574 int copied = 0;
7a6a6cbc
PA
1575 int target;
1576 long timeo;
f870fa0b
MM
1577
1578 if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
1579 return -EOPNOTSUPP;
1580
cec37a6e 1581 lock_sock(sk);
7a6a6cbc
PA
1582 timeo = sock_rcvtimeo(sk, nonblock);
1583
1584 len = min_t(size_t, len, INT_MAX);
1585 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
ec3edaa7 1586 __mptcp_flush_join_list(msk);
7a6a6cbc 1587
ea4ca586
PA
1588 for (;;) {
1589 int bytes_read, old_space;
7a6a6cbc 1590
6771bfd9
FW
1591 bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
1592 if (unlikely(bytes_read < 0)) {
1593 if (!copied)
1594 copied = bytes_read;
1595 goto out_err;
1596 }
7a6a6cbc 1597
6771bfd9 1598 copied += bytes_read;
7a6a6cbc 1599
6771bfd9 1600 if (skb_queue_empty(&sk->sk_receive_queue) &&
ea4ca586 1601 __mptcp_move_skbs(msk, len - copied))
6771bfd9 1602 continue;
7a6a6cbc 1603
ea4ca586
PA
1604 /* be sure to advertise window change */
1605 old_space = READ_ONCE(msk->old_wspace);
1606 if ((tcp_space(sk) - old_space) >= old_space)
1607 mptcp_send_ack(msk, false);
1608
7a6a6cbc
PA
1609 /* only the master socket status is relevant here. The exit
1610 * conditions mirror closely tcp_recvmsg()
1611 */
1612 if (copied >= target)
1613 break;
1614
1615 if (copied) {
1616 if (sk->sk_err ||
1617 sk->sk_state == TCP_CLOSE ||
1618 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1619 !timeo ||
1620 signal_pending(current))
1621 break;
1622 } else {
1623 if (sk->sk_err) {
1624 copied = sock_error(sk);
1625 break;
1626 }
1627
5969856a
PA
1628 if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
1629 mptcp_check_for_eof(msk);
1630
7a6a6cbc
PA
1631 if (sk->sk_shutdown & RCV_SHUTDOWN)
1632 break;
1633
1634 if (sk->sk_state == TCP_CLOSE) {
1635 copied = -ENOTCONN;
1636 break;
1637 }
1638
1639 if (!timeo) {
1640 copied = -EAGAIN;
1641 break;
1642 }
1643
1644 if (signal_pending(current)) {
1645 copied = sock_intr_errno(timeo);
1646 break;
1647 }
1648 }
1649
1650 pr_debug("block timeout %ld", timeo);
7a6a6cbc 1651 mptcp_wait_data(sk, &timeo);
cec37a6e
PK
1652 }
1653
6771bfd9
FW
1654 if (skb_queue_empty(&sk->sk_receive_queue)) {
1655 /* entire backlog drained, clear DATA_READY. */
7a6a6cbc 1656 clear_bit(MPTCP_DATA_READY, &msk->flags);
cec37a6e 1657
6771bfd9
FW
1658 /* .. race-breaker: ssk might have gotten new data
1659 * after last __mptcp_move_skbs() returned false.
7a6a6cbc 1660 */
ea4ca586 1661 if (unlikely(__mptcp_move_skbs(msk, 0)))
7a6a6cbc 1662 set_bit(MPTCP_DATA_READY, &msk->flags);
6771bfd9
FW
1663 } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
1664 /* data to read but mptcp_wait_data() cleared DATA_READY */
1665 set_bit(MPTCP_DATA_READY, &msk->flags);
7a6a6cbc 1666 }
6771bfd9 1667out_err:
6719331c
PA
1668 pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
1669 msk, test_bit(MPTCP_DATA_READY, &msk->flags),
1670 skb_queue_empty(&sk->sk_receive_queue), copied);
a6b118fe
FW
1671 mptcp_rcv_space_adjust(msk, copied);
1672
7a6a6cbc 1673 release_sock(sk);
cec37a6e
PK
1674 return copied;
1675}
1676
b51f9b80
PA
1677static void mptcp_retransmit_handler(struct sock *sk)
1678{
1679 struct mptcp_sock *msk = mptcp_sk(sk);
1680
eaa2ffab 1681 if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->snd_nxt)) {
b51f9b80 1682 mptcp_stop_timer(sk);
3b1d6210
PA
1683 } else {
1684 set_bit(MPTCP_WORK_RTX, &msk->flags);
ba8f48f7 1685 mptcp_schedule_work(sk);
3b1d6210 1686 }
b51f9b80
PA
1687}
1688
1689static void mptcp_retransmit_timer(struct timer_list *t)
1690{
1691 struct inet_connection_sock *icsk = from_timer(icsk, t,
1692 icsk_retransmit_timer);
1693 struct sock *sk = &icsk->icsk_inet.sk;
1694
1695 bh_lock_sock(sk);
1696 if (!sock_owned_by_user(sk)) {
1697 mptcp_retransmit_handler(sk);
1698 } else {
1699 /* delegate our work to tcp_release_cb() */
1700 if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
1701 &sk->sk_tsq_flags))
1702 sock_hold(sk);
1703 }
1704 bh_unlock_sock(sk);
1705 sock_put(sk);
1706}
1707
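/* sk_timer callback: defer the actual work to the msk workqueue and drop
 * the socket reference that was taken when the timer was armed.
 */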
1708static void mptcp_timeout_timer(struct timer_list *t)
1709{
1710 struct sock *sk = from_timer(sk, t, sk_timer);
1711
1712 mptcp_schedule_work(sk);
b6d69fc8 1713 sock_put(sk);
e16163b6
PA
1714}
1715
3b1d6210
PA
1716/* Find an idle subflow. Return NULL if there is unacked data at tcp
1717 * level.
1718 *
1719 * A backup subflow is returned only if that is the only kind available.
1720 */
1721static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
1722{
1723 struct mptcp_subflow_context *subflow;
1724 struct sock *backup = NULL;
1725
1726 sock_owned_by_me((const struct sock *)msk);
1727
d5f49190 1728 if (__mptcp_check_fallback(msk))
d9ca1de8 1729 return NULL;
d5f49190 1730
3b1d6210
PA
1731 mptcp_for_each_subflow(msk, subflow) {
1732 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1733
d5f49190
PA
1734 if (!mptcp_subflow_active(subflow))
1735 continue;
1736
3b1d6210 1737 /* still data outstanding at TCP level? Don't retransmit. */
860975c6
FW
1738 if (!tcp_write_queue_empty(ssk)) {
1739 if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss)
1740 continue;
3b1d6210 1741 return NULL;
860975c6 1742 }
3b1d6210
PA
1743
1744 if (subflow->backup) {
1745 if (!backup)
1746 backup = ssk;
1747 continue;
1748 }
1749
1750 return ssk;
1751 }
1752
1753 return backup;
1754}
1755
cec37a6e
PK
1756/* subflow sockets can be either outgoing (connect) or incoming
1757 * (accept).
1758 *
1759 * Outgoing subflows use in-kernel sockets.
1760 * Incoming subflows do not have their own 'struct socket' allocated,
1761 * so we need to use tcp_close() after detaching them from the mptcp
1762 * parent socket.
1763 */
d0876b22 1764void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
e16163b6 1765 struct mptcp_subflow_context *subflow)
cec37a6e 1766{
e16163b6
PA
1767 bool dispose_socket = false;
1768 struct socket *sock;
cec37a6e
PK
1769
1770 list_del(&subflow->node);
1771
e16163b6
PA
1772 lock_sock(ssk);
1773
1774 /* if we are invoked by the msk cleanup code, the subflow is
1775 * already orphaned
1776 */
1777 sock = ssk->sk_socket;
1778 if (sock) {
1779 dispose_socket = sock != sk->sk_socket;
1780 sock_orphan(ssk);
1781 }
1782
1783	/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops;
1784	 * the ssk has already been destroyed, we just need to release the
1785	 * reference owned by msk
1786 */
1787 if (!inet_csk(ssk)->icsk_ulp_ops) {
1788 kfree_rcu(subflow, rcu);
cec37a6e 1789 } else {
1790		/* otherwise ask tcp to dispose of ssk and subflow ctx */
1791 subflow->disposable = 1;
1792 __tcp_close(ssk, 0);
1793
1794 /* close acquired an extra ref */
1795 __sock_put(ssk);
cec37a6e 1796 }
e16163b6
PA
1797 release_sock(ssk);
1798 if (dispose_socket)
1799 iput(SOCK_INODE(sock));
1800
1801 sock_put(ssk);
f870fa0b
MM
1802}
1803
dc24f8b4
PA
1804static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
1805{
1806 return 0;
1807}
1808
b416268b
FW
1809static void pm_work(struct mptcp_sock *msk)
1810{
1811 struct mptcp_pm_data *pm = &msk->pm;
1812
1813 spin_lock_bh(&msk->pm.lock);
1814
1815 pr_debug("msk=%p status=%x", msk, pm->status);
1816 if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
1817 pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
1818 mptcp_pm_nl_add_addr_received(msk);
1819 }
84dfe367
GT
1820 if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) {
1821 pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK);
1822 mptcp_pm_nl_add_addr_send_ack(msk);
1823 }
d0876b22
GT
1824 if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
1825 pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
1826 mptcp_pm_nl_rm_addr_received(msk);
1827 }
b416268b
FW
1828 if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
1829 pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
1830 mptcp_pm_nl_fully_established(msk);
1831 }
1832 if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
1833 pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
1834 mptcp_pm_nl_subflow_established(msk);
1835 }
1836
1837 spin_unlock_bh(&msk->pm.lock);
1838}
1839
0e4f35d7
PA
1840static void __mptcp_close_subflow(struct mptcp_sock *msk)
1841{
1842 struct mptcp_subflow_context *subflow, *tmp;
1843
1844 list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
1845 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1846
1847 if (inet_sk_state_load(ssk) != TCP_CLOSE)
1848 continue;
1849
e16163b6 1850 __mptcp_close_ssk((struct sock *)msk, ssk, subflow);
0e4f35d7
PA
1851 }
1852}
1853
e16163b6
PA
1854static bool mptcp_check_close_timeout(const struct sock *sk)
1855{
1856 s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp;
1857 struct mptcp_subflow_context *subflow;
1858
1859 if (delta >= TCP_TIMEWAIT_LEN)
1860 return true;
1861
 1862 /* if all subflows are in closed status, don't bother with additional
1863 * timeout
1864 */
1865 mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
1866 if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) !=
1867 TCP_CLOSE)
1868 return false;
1869 }
1870 return true;
1871}
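/* The close timestamp is stashed in icsk_mtup.probe_timestamp by
 * mptcp_close() below, so an orphaned msk lingers at most TCP_TIMEWAIT_LEN,
 * or less if every subflow has already reached TCP_CLOSE.
 */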
1872
80992017
PA
1873static void mptcp_worker(struct work_struct *work)
1874{
1875 struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
3b1d6210 1876 struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
caf971df 1877 struct mptcp_sendmsg_info info = {};
3b1d6210 1878 struct mptcp_data_frag *dfrag;
3b1d6210 1879 size_t copied = 0;
e16163b6 1880 int state, ret;
80992017
PA
1881
1882 lock_sock(sk);
e16163b6
PA
1883 state = sk->sk_state;
1884 if (unlikely(state == TCP_CLOSE))
1885 goto unlock;
1886
95ed690e 1887 mptcp_clean_una_wakeup(sk);
43b54c6e 1888 mptcp_check_data_fin_ack(sk);
ec3edaa7 1889 __mptcp_flush_join_list(msk);
0e4f35d7
PA
1890 if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
1891 __mptcp_close_subflow(msk);
1892
813e0a68
PA
1893 if (mptcp_send_head(sk))
1894 mptcp_push_pending(sk, 0);
3b1d6210 1895
b416268b
FW
1896 if (msk->pm.status)
1897 pm_work(msk);
1898
59832e24
FW
1899 if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
1900 mptcp_check_for_eof(msk);
1901
43b54c6e
MM
1902 mptcp_check_data_fin(sk);
1903
e16163b6
PA
 1904 /* if the msk data is completely acked, or the socket timed out,
1905 * there is no point in keeping around an orphaned sk
1906 */
1907 if (sock_flag(sk, SOCK_DEAD) &&
1908 (mptcp_check_close_timeout(sk) ||
1909 (state != sk->sk_state &&
1910 ((1 << inet_sk_state_load(sk)) & (TCPF_CLOSE | TCPF_FIN_WAIT2))))) {
1911 inet_sk_state_store(sk, TCP_CLOSE);
1912 __mptcp_destroy_sock(sk);
1913 goto unlock;
1914 }
1915
3b1d6210
PA
1916 if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
1917 goto unlock;
1918
1919 dfrag = mptcp_rtx_head(sk);
1920 if (!dfrag)
1921 goto unlock;
1922
149f7c71
FW
1923 if (!mptcp_ext_cache_refill(msk))
1924 goto reset_unlock;
1925
3b1d6210
PA
1926 ssk = mptcp_subflow_get_retrans(msk);
1927 if (!ssk)
1928 goto reset_unlock;
1929
1930 lock_sock(ssk);
1931
d9ca1de8
PA
1932 /* limit retransmission to the bytes already sent on some subflows */
1933 info.sent = 0;
1934 info.limit = dfrag->already_sent;
1935 while (info.sent < dfrag->already_sent) {
1936 ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
6f8a612a 1937 if (ret <= 0)
3b1d6210
PA
1938 break;
1939
fc518953 1940 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
3b1d6210 1941 copied += ret;
d9ca1de8 1942 info.sent += ret;
149f7c71
FW
1943
1944 if (!mptcp_ext_cache_refill(msk))
1945 break;
3b1d6210
PA
1946 }
1947 if (copied)
caf971df
PA
1948 tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
1949 info.size_goal);
3b1d6210 1950
3b1d6210
PA
1951 mptcp_set_timeout(sk, ssk);
1952 release_sock(ssk);
1953
1954reset_unlock:
1955 if (!mptcp_timer_pending(sk))
1956 mptcp_reset_timer(sk);
1957
1958unlock:
80992017
PA
1959 release_sock(sk);
1960 sock_put(sk);
1961}
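/* Retransmission sketch, as implemented above: the worker takes the oldest
 * dfrag still in the rtx queue, picks a subflow via
 * mptcp_subflow_get_retrans() (roughly: prefer non-backup subflows with an
 * empty TCP write queue), re-injects at most dfrag->already_sent bytes via
 * mptcp_sendmsg_frag(), then re-arms the MPTCP-level retransmit timer.
 */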
1962
784325e9 1963static int __mptcp_init_sock(struct sock *sk)
f870fa0b 1964{
cec37a6e
PK
1965 struct mptcp_sock *msk = mptcp_sk(sk);
1966
ec3edaa7
PK
1967 spin_lock_init(&msk->join_list_lock);
1968
cec37a6e 1969 INIT_LIST_HEAD(&msk->conn_list);
ec3edaa7 1970 INIT_LIST_HEAD(&msk->join_list);
18b683bf 1971 INIT_LIST_HEAD(&msk->rtx_queue);
80992017 1972 INIT_WORK(&msk->work, mptcp_worker);
ab174ad8 1973 msk->out_of_order_queue = RB_ROOT;
f0e6a4cf 1974 msk->first_pending = NULL;
cec37a6e 1975
ea4ca586 1976 msk->ack_hint = NULL;
8ab183de 1977 msk->first = NULL;
dc24f8b4 1978 inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
8ab183de 1979
1b1c7a0e
PK
1980 mptcp_pm_data_init(msk);
1981
b51f9b80
PA
1982 /* re-use the csk retrans timer for MPTCP-level retrans */
1983 timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
e16163b6 1984 timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0);
f870fa0b
MM
1985 return 0;
1986}
1987
784325e9
MB
1988static int mptcp_init_sock(struct sock *sk)
1989{
fc518953
FW
1990 struct net *net = sock_net(sk);
1991 int ret;
18b683bf 1992
b6c08380
GT
1993 ret = __mptcp_init_sock(sk);
1994 if (ret)
1995 return ret;
1996
fc518953
FW
1997 if (!mptcp_is_enabled(net))
1998 return -ENOPROTOOPT;
1999
2000 if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
2001 return -ENOMEM;
2002
fa68018d
PA
2003 ret = __mptcp_socket_create(mptcp_sk(sk));
2004 if (ret)
2005 return ret;
2006
d027236c 2007 sk_sockets_allocated_inc(sk);
a6b118fe 2008 sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
da51aef5 2009 sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
d027236c 2010
18b683bf
PA
2011 return 0;
2012}
2013
2014static void __mptcp_clear_xmit(struct sock *sk)
2015{
2016 struct mptcp_sock *msk = mptcp_sk(sk);
2017 struct mptcp_data_frag *dtmp, *dfrag;
2018
b51f9b80
PA
2019 sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
2020
d9ca1de8 2021 WRITE_ONCE(msk->first_pending, NULL);
18b683bf 2022 list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
d027236c 2023 dfrag_clear(sk, dfrag);
784325e9
MB
2024}
2025
80992017
PA
2026static void mptcp_cancel_work(struct sock *sk)
2027{
2028 struct mptcp_sock *msk = mptcp_sk(sk);
2029
b2771d24 2030 if (cancel_work_sync(&msk->work))
e16163b6 2031 __sock_put(sk);
80992017
PA
2032}
2033
d0876b22 2034void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
21498490
PK
2035{
2036 lock_sock(ssk);
2037
2038 switch (ssk->sk_state) {
2039 case TCP_LISTEN:
2040 if (!(how & RCV_SHUTDOWN))
2041 break;
df561f66 2042 fallthrough;
21498490
PK
2043 case TCP_SYN_SENT:
2044 tcp_disconnect(ssk, O_NONBLOCK);
2045 break;
2046 default:
43b54c6e
MM
2047 if (__mptcp_check_fallback(mptcp_sk(sk))) {
2048 pr_debug("Fallback");
2049 ssk->sk_shutdown |= how;
2050 tcp_shutdown(ssk, how);
2051 } else {
2052 pr_debug("Sending DATA_FIN on subflow %p", ssk);
2053 mptcp_set_timeout(sk, ssk);
2054 tcp_send_ack(ssk);
2055 }
21498490
PK
2056 break;
2057 }
2058
21498490
PK
2059 release_sock(ssk);
2060}
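/* Note: in the non-fallback case no TCP-level FIN is sent here. The
 * MPTCP-level DATA_FIN is carried in the DSS option, so once
 * snd_data_fin_enable is set (see __mptcp_wr_shutdown() below) a plain
 * tcp_send_ack() on the subflow is enough to emit it.
 */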
2061
6920b851
MM
2062static const unsigned char new_state[16] = {
2063 /* current state: new state: action: */
2064 [0 /* (Invalid) */] = TCP_CLOSE,
2065 [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2066 [TCP_SYN_SENT] = TCP_CLOSE,
2067 [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2068 [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
2069 [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
2070 [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */
2071 [TCP_CLOSE] = TCP_CLOSE,
2072 [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
2073 [TCP_LAST_ACK] = TCP_LAST_ACK,
2074 [TCP_LISTEN] = TCP_CLOSE,
2075 [TCP_CLOSING] = TCP_CLOSING,
2076 [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
2077};
2078
2079static int mptcp_close_state(struct sock *sk)
2080{
2081 int next = (int)new_state[sk->sk_state];
2082 int ns = next & TCP_STATE_MASK;
2083
2084 inet_sk_state_store(sk, ns);
2085
2086 return next & TCP_ACTION_FIN;
2087}
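/* Example: from TCP_ESTABLISHED the table above yields
 * TCP_FIN_WAIT1 | TCP_ACTION_FIN, i.e. the caller must also queue a
 * DATA_FIN (__mptcp_wr_shutdown()); from TCP_CLOSE_WAIT it yields
 * TCP_LAST_ACK | TCP_ACTION_FIN. This mirrors tcp_close_state().
 */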
2088
e16163b6 2089static void __mptcp_check_send_data_fin(struct sock *sk)
f870fa0b 2090{
e16163b6 2091 struct mptcp_subflow_context *subflow;
f870fa0b
MM
2092 struct mptcp_sock *msk = mptcp_sk(sk);
2093
e16163b6
PA
2094 pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu",
2095 msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk),
2096 msk->snd_nxt, msk->write_seq);
43b54c6e 2097
e16163b6
PA
 2098 /* we still need to enqueue subflows, or we are not really shutting
 2099 * down: skip this
2100 */
2101 if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq ||
2102 mptcp_send_head(sk))
2103 return;
2104
2105 WRITE_ONCE(msk->snd_nxt, msk->write_seq);
2106
26aa2314
PA
 2107 /* the fallback socket will not get a DATA_FIN/ack, we can move to the
 2108 * next state now
2109 */
2110 if (__mptcp_check_fallback(msk)) {
2111 if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
2112 inet_sk_state_store(sk, TCP_CLOSE);
2113 mptcp_close_wake_up(sk);
2114 } else if (sk->sk_state == TCP_FIN_WAIT1) {
2115 inet_sk_state_store(sk, TCP_FIN_WAIT2);
2116 }
43b54c6e
MM
2117 }
2118
e16163b6
PA
2119 __mptcp_flush_join_list(msk);
2120 mptcp_for_each_subflow(msk, subflow) {
2121 struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
43b54c6e 2122
e16163b6 2123 mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN);
43b54c6e 2124 }
e16163b6 2125}
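/* Sequence-space accounting: __mptcp_wr_shutdown() below reserves one unit
 * of MPTCP sequence space for the DATA_FIN by bumping write_seq, so the
 * check above (snd_nxt + 1 == write_seq) holds exactly when every queued
 * byte has been pushed and only the DATA_FIN itself is left to send.
 */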
2c22c06c 2126
e16163b6
PA
2127static void __mptcp_wr_shutdown(struct sock *sk)
2128{
2129 struct mptcp_sock *msk = mptcp_sk(sk);
43b54c6e 2130
e16163b6
PA
2131 pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d",
2132 msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state,
2133 !!mptcp_send_head(sk));
2134
2135 /* will be ignored by fallback sockets */
2136 WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
2137 WRITE_ONCE(msk->snd_data_fin_enable, 1);
2138
2139 __mptcp_check_send_data_fin(sk);
2140}
2141
2142static void __mptcp_destroy_sock(struct sock *sk)
2143{
2144 struct mptcp_subflow_context *subflow, *tmp;
2145 struct mptcp_sock *msk = mptcp_sk(sk);
2146 LIST_HEAD(conn_list);
2147
2148 pr_debug("msk=%p", msk);
f870fa0b 2149
10f6d46c
PA
2150 /* be sure to always acquire the join list lock, to sync vs
2151 * mptcp_finish_join().
2152 */
2153 spin_lock_bh(&msk->join_list_lock);
2154 list_splice_tail_init(&msk->join_list, &msk->conn_list);
2155 spin_unlock_bh(&msk->join_list_lock);
b2c5b614
FW
2156 list_splice_init(&msk->conn_list, &conn_list);
2157
18b683bf 2158 __mptcp_clear_xmit(sk);
e16163b6
PA
2159 sk_stop_timer(sk, &sk->sk_timer);
2160 msk->pm.status = 0;
b2c5b614
FW
2161
2162 list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
cec37a6e 2163 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
e16163b6 2164 __mptcp_close_ssk(sk, ssk, subflow);
f870fa0b
MM
2165 }
2166
e16163b6 2167 sk->sk_prot->destroy(sk);
80992017 2168
e16163b6
PA
2169 sk_stream_kill_queues(sk);
2170 xfrm_sk_free_policy(sk);
2171 sk_refcnt_debug_release(sk);
2172 sock_put(sk);
2173}
2174
2175static void mptcp_close(struct sock *sk, long timeout)
2176{
2177 struct mptcp_subflow_context *subflow;
2178 bool do_cancel_work = false;
2179
2180 lock_sock(sk);
2181 sk->sk_shutdown = SHUTDOWN_MASK;
2182
2183 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
2184 inet_sk_state_store(sk, TCP_CLOSE);
2185 goto cleanup;
2186 }
6771bfd9 2187
e16163b6
PA
2188 if (mptcp_close_state(sk))
2189 __mptcp_wr_shutdown(sk);
2190
2191 sk_stream_wait_close(sk, timeout);
2192
2193cleanup:
2194 /* orphan all the subflows */
2195 inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
2196 list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) {
2197 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
2198 bool slow, dispose_socket;
2199 struct socket *sock;
2200
2201 slow = lock_sock_fast(ssk);
2202 sock = ssk->sk_socket;
2203 dispose_socket = sock && sock != sk->sk_socket;
2204 sock_orphan(ssk);
2205 unlock_sock_fast(ssk, slow);
2206
2207 /* for the outgoing subflows we additionally need to free
2208 * the associated socket
2209 */
2210 if (dispose_socket)
2211 iput(SOCK_INODE(sock));
2212 }
2213 sock_orphan(sk);
2214
2215 sock_hold(sk);
2216 pr_debug("msk=%p state=%d", sk, sk->sk_state);
2217 if (sk->sk_state == TCP_CLOSE) {
2218 __mptcp_destroy_sock(sk);
2219 do_cancel_work = true;
2220 } else {
2221 sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
2222 }
2223 release_sock(sk);
2224 if (do_cancel_work)
2225 mptcp_cancel_work(sk);
2226 sock_put(sk);
f870fa0b
MM
2227}
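/* Close sketch: all subflows are orphaned immediately, but the msk itself
 * is torn down right away only if it already reached TCP_CLOSE. Otherwise
 * destruction is deferred: sk_timer (mptcp_timeout_timer, armed for
 * TCP_TIMEWAIT_LEN) and the worker take over, with icsk_mtup.probe_timestamp
 * reused above as the close timestamp for mptcp_check_close_timeout().
 */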
2228
cf7da0d6
PK
2229static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
2230{
2231#if IS_ENABLED(CONFIG_MPTCP_IPV6)
2232 const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
2233 struct ipv6_pinfo *msk6 = inet6_sk(msk);
2234
2235 msk->sk_v6_daddr = ssk->sk_v6_daddr;
2236 msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;
2237
2238 if (msk6 && ssk6) {
2239 msk6->saddr = ssk6->saddr;
2240 msk6->flow_label = ssk6->flow_label;
2241 }
2242#endif
2243
2244 inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
2245 inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
2246 inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
2247 inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
2248 inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
2249 inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
2250}
2251
18b683bf
PA
2252static int mptcp_disconnect(struct sock *sk, int flags)
2253{
42c556fe
FW
2254 /* Should never be called.
2255 * inet_stream_connect() calls ->disconnect, but that
2256 * refers to the subflow socket, not the mptcp one.
2257 */
2258 WARN_ON_ONCE(1);
2259 return 0;
18b683bf
PA
2260}
2261
b0519de8
FW
2262#if IS_ENABLED(CONFIG_MPTCP_IPV6)
2263static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
2264{
2265 unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);
2266
2267 return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
2268}
2269#endif
2270
fca5c82c 2271struct sock *mptcp_sk_clone(const struct sock *sk,
cfde141e 2272 const struct mptcp_options_received *mp_opt,
fca5c82c 2273 struct request_sock *req)
b0519de8 2274{
58b09919 2275 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
b0519de8 2276 struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
58b09919
PA
2277 struct mptcp_sock *msk;
2278 u64 ack_seq;
b0519de8
FW
2279
2280 if (!nsk)
2281 return NULL;
2282
2283#if IS_ENABLED(CONFIG_MPTCP_IPV6)
2284 if (nsk->sk_family == AF_INET6)
2285 inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
2286#endif
2287
58b09919
PA
2288 __mptcp_init_sock(nsk);
2289
2290 msk = mptcp_sk(nsk);
2291 msk->local_key = subflow_req->local_key;
2292 msk->token = subflow_req->token;
2293 msk->subflow = NULL;
b93df08c 2294 WRITE_ONCE(msk->fully_established, false);
58b09919 2295
58b09919 2296 msk->write_seq = subflow_req->idsn + 1;
eaa2ffab 2297 msk->snd_nxt = msk->write_seq;
cc9d2566 2298 atomic64_set(&msk->snd_una, msk->write_seq);
6f8a612a
FW
2299 atomic64_set(&msk->wnd_end, msk->snd_nxt + req->rsk_rcv_wnd);
2300
cfde141e 2301 if (mp_opt->mp_capable) {
58b09919 2302 msk->can_ack = true;
cfde141e 2303 msk->remote_key = mp_opt->sndr_key;
58b09919
PA
2304 mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
2305 ack_seq++;
917944da 2306 WRITE_ONCE(msk->ack_seq, ack_seq);
fa3fe2b1 2307 WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
58b09919 2308 }
7f20d5fc 2309
5e20087d 2310 sock_reset_flag(nsk, SOCK_RCU_FREE);
7f20d5fc
PA
2311 /* will be fully established after successful MPC subflow creation */
2312 inet_sk_state_store(nsk, TCP_SYN_RECV);
58b09919
PA
2313 bh_unlock_sock(nsk);
2314
2315 /* keep a single reference */
2316 __sock_put(nsk);
b0519de8
FW
2317 return nsk;
2318}
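/* The clone inherits keys and token from the request socket; when the peer
 * key is already known (MP_CAPABLE), the initial data ACK and rcv_wnd_sent
 * are derived by hashing the remote key with mptcp_crypto_key_sha(),
 * matching the derivation done in mptcp_finish_connect() for the active
 * side.
 */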
2319
a6b118fe
FW
2320void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
2321{
2322 const struct tcp_sock *tp = tcp_sk(ssk);
2323
2324 msk->rcvq_space.copied = 0;
2325 msk->rcvq_space.rtt_us = 0;
2326
2327 msk->rcvq_space.time = tp->tcp_mstamp;
2328
2329 /* initial rcv_space offering made to peer */
2330 msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
2331 TCP_INIT_CWND * tp->advmss);
2332 if (msk->rcvq_space.space == 0)
2333 msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
6f8a612a
FW
2334
2335 atomic64_set(&msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
a6b118fe
FW
2336}
2337
cf7da0d6
PK
2338static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
2339 bool kern)
2340{
2341 struct mptcp_sock *msk = mptcp_sk(sk);
2342 struct socket *listener;
2343 struct sock *newsk;
2344
2345 listener = __mptcp_nmpc_socket(msk);
2346 if (WARN_ON_ONCE(!listener)) {
2347 *err = -EINVAL;
2348 return NULL;
2349 }
2350
2351 pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
2352 newsk = inet_csk_accept(listener->sk, flags, err, kern);
2353 if (!newsk)
2354 return NULL;
2355
2356 pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
cf7da0d6
PK
2357 if (sk_is_mptcp(newsk)) {
2358 struct mptcp_subflow_context *subflow;
2359 struct sock *new_mptcp_sock;
cf7da0d6
PK
2360
2361 subflow = mptcp_subflow_ctx(newsk);
58b09919 2362 new_mptcp_sock = subflow->conn;
cf7da0d6 2363
58b09919
PA
2364 /* is_mptcp should be false if subflow->conn is missing, see
2365 * subflow_syn_recv_sock()
2366 */
2367 if (WARN_ON_ONCE(!new_mptcp_sock)) {
2368 tcp_sk(newsk)->is_mptcp = 0;
2369 return newsk;
cf7da0d6
PK
2370 }
2371
58b09919
PA
2372 /* acquire the 2nd reference for the owning socket */
2373 sock_hold(new_mptcp_sock);
cf7da0d6 2374 newsk = new_mptcp_sock;
0397c6d8 2375 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
fc518953
FW
2376 } else {
2377 MPTCP_INC_STATS(sock_net(sk),
2378 MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
cf7da0d6
PK
2379 }
2380
2381 return newsk;
2382}
2383
5c8c1640
GT
2384void mptcp_destroy_common(struct mptcp_sock *msk)
2385{
2386 skb_rbtree_purge(&msk->out_of_order_queue);
2387 mptcp_token_destroy(msk);
2388 mptcp_pm_free_anno_list(msk);
2389}
2390
79c0949e
PK
2391static void mptcp_destroy(struct sock *sk)
2392{
c9fd9c5f
FW
2393 struct mptcp_sock *msk = mptcp_sk(sk);
2394
2395 if (msk->cached_ext)
2396 __skb_ext_put(msk->cached_ext);
d027236c 2397
5c8c1640 2398 mptcp_destroy_common(msk);
d027236c 2399 sk_sockets_allocated_dec(sk);
79c0949e
PK
2400}
2401
fd1452d8 2402static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
a7b75c5a 2403 sockptr_t optval, unsigned int optlen)
fd1452d8
FW
2404{
2405 struct sock *sk = (struct sock *)msk;
2406 struct socket *ssock;
2407 int ret;
2408
2409 switch (optname) {
2410 case SO_REUSEPORT:
2411 case SO_REUSEADDR:
2412 lock_sock(sk);
2413 ssock = __mptcp_nmpc_socket(msk);
2414 if (!ssock) {
2415 release_sock(sk);
2416 return -EINVAL;
2417 }
2418
a7b75c5a 2419 ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
fd1452d8
FW
2420 if (ret == 0) {
2421 if (optname == SO_REUSEPORT)
2422 sk->sk_reuseport = ssock->sk->sk_reuseport;
2423 else if (optname == SO_REUSEADDR)
2424 sk->sk_reuse = ssock->sk->sk_reuse;
2425 }
2426 release_sock(sk);
2427 return ret;
2428 }
2429
a7b75c5a 2430 return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen);
fd1452d8
FW
2431}
2432
c9b95a13 2433static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
a7b75c5a 2434 sockptr_t optval, unsigned int optlen)
c9b95a13
FW
2435{
2436 struct sock *sk = (struct sock *)msk;
2437 int ret = -EOPNOTSUPP;
2438 struct socket *ssock;
2439
2440 switch (optname) {
2441 case IPV6_V6ONLY:
2442 lock_sock(sk);
2443 ssock = __mptcp_nmpc_socket(msk);
2444 if (!ssock) {
2445 release_sock(sk);
2446 return -EINVAL;
2447 }
2448
2449 ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
2450 if (ret == 0)
2451 sk->sk_ipv6only = ssock->sk->sk_ipv6only;
2452
2453 release_sock(sk);
2454 break;
2455 }
2456
2457 return ret;
2458}
2459
717e79c8 2460static int mptcp_setsockopt(struct sock *sk, int level, int optname,
a7b75c5a 2461 sockptr_t optval, unsigned int optlen)
717e79c8
PK
2462{
2463 struct mptcp_sock *msk = mptcp_sk(sk);
76660afb 2464 struct sock *ssk;
717e79c8
PK
2465
2466 pr_debug("msk=%p", msk);
2467
83f0c10b 2468 if (level == SOL_SOCKET)
fd1452d8 2469 return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);
83f0c10b 2470
717e79c8 2471 /* @@ the meaning of setsockopt() when the socket is connected and
b6e4a1ae
MM
2472 * there are multiple subflows is not yet defined. It is up to the
 2473 * MPTCP-level socket to configure the subflows until the connection
 2474 * falls back to TCP, at which point TCP socket options are passed through
2475 * to the one remaining subflow.
717e79c8
PK
2476 */
2477 lock_sock(sk);
76660afb 2478 ssk = __mptcp_tcp_fallback(msk);
e154659b 2479 release_sock(sk);
76660afb
PA
2480 if (ssk)
2481 return tcp_setsockopt(ssk, level, optname, optval, optlen);
50e741bb 2482
c9b95a13
FW
2483 if (level == SOL_IPV6)
2484 return mptcp_setsockopt_v6(msk, optname, optval, optlen);
2485
b6e4a1ae 2486 return -EOPNOTSUPP;
717e79c8
PK
2487}
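/* Option routing sketch: SO_REUSEADDR/SO_REUSEPORT are applied on the first
 * subflow and mirrored into the msk, other SOL_SOCKET options are set on
 * the msk socket itself, and IPV6_V6ONLY is forwarded to the first subflow.
 * After a fallback, options go straight to the single remaining TCP
 * subflow; everything else is rejected with -EOPNOTSUPP for now.
 */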
2488
2489static int mptcp_getsockopt(struct sock *sk, int level, int optname,
50e741bb 2490 char __user *optval, int __user *option)
717e79c8
PK
2491{
2492 struct mptcp_sock *msk = mptcp_sk(sk);
76660afb 2493 struct sock *ssk;
717e79c8
PK
2494
2495 pr_debug("msk=%p", msk);
2496
b6e4a1ae
MM
 2497 /* @@ the meaning of getsockopt() when the socket is connected and
 2498 * there are multiple subflows is not yet defined. It is up to the
 2499 * MPTCP-level socket to configure the subflows until the connection
 2500 * falls back to TCP, at which point socket options are passed through
2501 * to the one remaining subflow.
717e79c8
PK
2502 */
2503 lock_sock(sk);
76660afb 2504 ssk = __mptcp_tcp_fallback(msk);
e154659b 2505 release_sock(sk);
76660afb
PA
2506 if (ssk)
2507 return tcp_getsockopt(ssk, level, optname, optval, option);
50e741bb 2508
b6e4a1ae 2509 return -EOPNOTSUPP;
717e79c8
PK
2510}
2511
ea4ca586 2512#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)
14c441b5
PA
2513
 2514/* this is very similar to tcp_release_cb(), but we must handle a
2515 * different set of events
2516 */
2517static void mptcp_release_cb(struct sock *sk)
2518{
2519 unsigned long flags, nflags;
2520
2521 do {
2522 flags = sk->sk_tsq_flags;
2523 if (!(flags & MPTCP_DEFERRED_ALL))
2524 return;
2525 nflags = flags & ~MPTCP_DEFERRED_ALL;
2526 } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
2527
b51f9b80
PA
2528 sock_release_ownership(sk);
2529
b51f9b80
PA
2530 if (flags & TCPF_WRITE_TIMER_DEFERRED) {
2531 mptcp_retransmit_handler(sk);
2532 __sock_put(sk);
2533 }
14c441b5
PA
2534}
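/* Deferred-event sketch, mirroring tcp_release_cb(): when the MPTCP
 * retransmit timer fires while the socket is owned by user context, the
 * timer handler presumably sets TCPF_WRITE_TIMER_DEFERRED and takes a
 * reference; the actual work then runs here at release_sock() time and the
 * reference is dropped via __sock_put().
 */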
2535
2c5ebd00
PA
2536static int mptcp_hash(struct sock *sk)
2537{
2538 /* should never be called,
 2539 * we hash the TCP subflows, not the master socket
2540 */
2541 WARN_ON_ONCE(1);
2542 return 0;
2543}
2544
2545static void mptcp_unhash(struct sock *sk)
2546{
2547 /* called from sk_common_release(), but nothing to do here */
2548}
2549
cec37a6e 2550static int mptcp_get_port(struct sock *sk, unsigned short snum)
f870fa0b
MM
2551{
2552 struct mptcp_sock *msk = mptcp_sk(sk);
cec37a6e 2553 struct socket *ssock;
f870fa0b 2554
cec37a6e
PK
2555 ssock = __mptcp_nmpc_socket(msk);
2556 pr_debug("msk=%p, subflow=%p", msk, ssock);
2557 if (WARN_ON_ONCE(!ssock))
2558 return -EINVAL;
f870fa0b 2559
cec37a6e
PK
2560 return inet_csk_get_port(ssock->sk, snum);
2561}
f870fa0b 2562
cec37a6e
PK
2563void mptcp_finish_connect(struct sock *ssk)
2564{
2565 struct mptcp_subflow_context *subflow;
2566 struct mptcp_sock *msk;
2567 struct sock *sk;
6d0060f6 2568 u64 ack_seq;
f870fa0b 2569
cec37a6e 2570 subflow = mptcp_subflow_ctx(ssk);
cec37a6e
PK
2571 sk = subflow->conn;
2572 msk = mptcp_sk(sk);
2573
648ef4b8
MM
2574 pr_debug("msk=%p, token=%u", sk, subflow->token);
2575
6d0060f6
MM
2576 mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
2577 ack_seq++;
648ef4b8
MM
2578 subflow->map_seq = ack_seq;
2579 subflow->map_subflow_seq = 1;
6d0060f6 2580
cec37a6e
PK
 2581 /* the socket is not connected yet, so no msk/subflow ops can access or
 2582 * race on the fields below
2583 */
2584 WRITE_ONCE(msk->remote_key, subflow->remote_key);
2585 WRITE_ONCE(msk->local_key, subflow->local_key);
6d0060f6 2586 WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
eaa2ffab 2587 WRITE_ONCE(msk->snd_nxt, msk->write_seq);
6d0060f6 2588 WRITE_ONCE(msk->ack_seq, ack_seq);
fa3fe2b1 2589 WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
d22f4988 2590 WRITE_ONCE(msk->can_ack, 1);
cc9d2566 2591 atomic64_set(&msk->snd_una, msk->write_seq);
1b1c7a0e
PK
2592
2593 mptcp_pm_new_connection(msk, 0);
a6b118fe
FW
2594
2595 mptcp_rcv_space_init(msk, ssk);
f870fa0b
MM
2596}
2597
cf7da0d6
PK
2598static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
2599{
2600 write_lock_bh(&sk->sk_callback_lock);
2601 rcu_assign_pointer(sk->sk_wq, &parent->wq);
2602 sk_set_socket(sk, parent);
2603 sk->sk_uid = SOCK_INODE(parent)->i_uid;
2604 write_unlock_bh(&sk->sk_callback_lock);
2605}
2606
e16163b6 2607bool mptcp_finish_join(struct sock *ssk)
f296234c 2608{
e16163b6 2609 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
f296234c
PK
2610 struct mptcp_sock *msk = mptcp_sk(subflow->conn);
2611 struct sock *parent = (void *)msk;
2612 struct socket *parent_sock;
ec3edaa7 2613 bool ret;
f296234c
PK
2614
2615 pr_debug("msk=%p, subflow=%p", msk, subflow);
2616
2617 /* mptcp socket already closing? */
b93df08c 2618 if (!mptcp_is_fully_established(parent))
f296234c
PK
2619 return false;
2620
2621 if (!msk->pm.server_side)
2622 return true;
2623
10f6d46c
PA
2624 if (!mptcp_pm_allow_new_subflow(msk))
2625 return false;
2626
2627 /* active connections are already on conn_list, and we can't acquire
2628 * msk lock here.
2629 * use the join list lock as synchronization point and double-check
e16163b6 2630 * msk status to avoid racing with __mptcp_destroy_sock()
10f6d46c
PA
2631 */
2632 spin_lock_bh(&msk->join_list_lock);
2633 ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
e16163b6 2634 if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) {
10f6d46c 2635 list_add_tail(&subflow->node, &msk->join_list);
e16163b6
PA
2636 sock_hold(ssk);
2637 }
10f6d46c
PA
2638 spin_unlock_bh(&msk->join_list_lock);
2639 if (!ret)
2640 return false;
2641
2642 /* attach to msk socket only after we are sure he will deal with us
2643 * at close time
2644 */
f296234c 2645 parent_sock = READ_ONCE(parent->sk_socket);
e16163b6
PA
2646 if (parent_sock && !ssk->sk_socket)
2647 mptcp_sock_graft(ssk, parent_sock);
917944da 2648 subflow->map_seq = READ_ONCE(msk->ack_seq);
10f6d46c 2649 return true;
f296234c
PK
2650}
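/* Join sketch: passively created subflows are appended to join_list under
 * join_list_lock, double-checking that the msk is still ESTABLISHED; they
 * are later spliced onto conn_list by __mptcp_flush_join_list() (or by
 * __mptcp_destroy_sock() above) once the msk lock is held.
 */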
2651
f870fa0b
MM
2652static struct proto mptcp_prot = {
2653 .name = "MPTCP",
2654 .owner = THIS_MODULE,
2655 .init = mptcp_init_sock,
18b683bf 2656 .disconnect = mptcp_disconnect,
f870fa0b 2657 .close = mptcp_close,
cf7da0d6 2658 .accept = mptcp_accept,
717e79c8
PK
2659 .setsockopt = mptcp_setsockopt,
2660 .getsockopt = mptcp_getsockopt,
f870fa0b 2661 .shutdown = tcp_shutdown,
79c0949e 2662 .destroy = mptcp_destroy,
f870fa0b
MM
2663 .sendmsg = mptcp_sendmsg,
2664 .recvmsg = mptcp_recvmsg,
14c441b5 2665 .release_cb = mptcp_release_cb,
2c5ebd00
PA
2666 .hash = mptcp_hash,
2667 .unhash = mptcp_unhash,
cec37a6e 2668 .get_port = mptcp_get_port,
d027236c
PA
2669 .sockets_allocated = &mptcp_sockets_allocated,
2670 .memory_allocated = &tcp_memory_allocated,
2671 .memory_pressure = &tcp_memory_pressure,
d027236c 2672 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
989ef49b 2673 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
d027236c 2674 .sysctl_mem = sysctl_tcp_mem,
f870fa0b 2675 .obj_size = sizeof(struct mptcp_sock),
2c5ebd00 2676 .slab_flags = SLAB_TYPESAFE_BY_RCU,
f870fa0b
MM
2677 .no_autobind = true,
2678};
2679
2303f994
PK
2680static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2681{
2682 struct mptcp_sock *msk = mptcp_sk(sock->sk);
2683 struct socket *ssock;
cf7da0d6 2684 int err;
2303f994
PK
2685
2686 lock_sock(sock->sk);
fa68018d
PA
2687 ssock = __mptcp_nmpc_socket(msk);
2688 if (!ssock) {
2689 err = -EINVAL;
2303f994
PK
2690 goto unlock;
2691 }
2692
2693 err = ssock->ops->bind(ssock, uaddr, addr_len);
cf7da0d6
PK
2694 if (!err)
2695 mptcp_copy_inaddrs(sock->sk, ssock->sk);
2303f994
PK
2696
2697unlock:
2698 release_sock(sock->sk);
2699 return err;
2700}
2701
0235d075
PA
2702static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
2703 struct mptcp_subflow_context *subflow)
2704{
2705 subflow->request_mptcp = 0;
2706 __mptcp_do_fallback(msk);
2707}
2708
2303f994
PK
2709static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
2710 int addr_len, int flags)
2711{
2712 struct mptcp_sock *msk = mptcp_sk(sock->sk);
2c5ebd00 2713 struct mptcp_subflow_context *subflow;
2303f994
PK
2714 struct socket *ssock;
2715 int err;
2716
2717 lock_sock(sock->sk);
41be81a8
PA
2718 if (sock->state != SS_UNCONNECTED && msk->subflow) {
2719 /* pending connection or invalid state, let existing subflow
2720 * cope with that
2721 */
2722 ssock = msk->subflow;
2723 goto do_connect;
2724 }
2725
fa68018d
PA
2726 ssock = __mptcp_nmpc_socket(msk);
2727 if (!ssock) {
2728 err = -EINVAL;
2303f994
PK
2729 goto unlock;
2730 }
2731
fa68018d
PA
2732 mptcp_token_destroy(msk);
2733 inet_sk_state_store(sock->sk, TCP_SYN_SENT);
2c5ebd00 2734 subflow = mptcp_subflow_ctx(ssock->sk);
cf7da0d6
PK
2735#ifdef CONFIG_TCP_MD5SIG
2736 /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
2737 * TCP option space.
2738 */
2739 if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
0235d075 2740 mptcp_subflow_early_fallback(msk, subflow);
cf7da0d6 2741#endif
2c5ebd00 2742 if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
0235d075 2743 mptcp_subflow_early_fallback(msk, subflow);
cf7da0d6 2744
41be81a8 2745do_connect:
2303f994 2746 err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
41be81a8
PA
2747 sock->state = ssock->state;
2748
2749 /* on successful connect, the msk state will be moved to established by
2750 * subflow_finish_connect()
2751 */
367fe04e 2752 if (!err || err == -EINPROGRESS)
41be81a8
PA
2753 mptcp_copy_inaddrs(sock->sk, ssock->sk);
2754 else
2755 inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
2303f994
PK
2756
2757unlock:
2758 release_sock(sock->sk);
2759 return err;
2760}
2761
cf7da0d6
PK
2762static int mptcp_listen(struct socket *sock, int backlog)
2763{
2764 struct mptcp_sock *msk = mptcp_sk(sock->sk);
2765 struct socket *ssock;
2766 int err;
2767
2768 pr_debug("msk=%p", msk);
2769
2770 lock_sock(sock->sk);
fa68018d
PA
2771 ssock = __mptcp_nmpc_socket(msk);
2772 if (!ssock) {
2773 err = -EINVAL;
cf7da0d6
PK
2774 goto unlock;
2775 }
2776
fa68018d
PA
2777 mptcp_token_destroy(msk);
2778 inet_sk_state_store(sock->sk, TCP_LISTEN);
5e20087d
FW
2779 sock_set_flag(sock->sk, SOCK_RCU_FREE);
2780
cf7da0d6
PK
2781 err = ssock->ops->listen(ssock, backlog);
2782 inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
2783 if (!err)
2784 mptcp_copy_inaddrs(sock->sk, ssock->sk);
2785
2786unlock:
2787 release_sock(sock->sk);
2788 return err;
2789}
2790
cf7da0d6
PK
2791static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
2792 int flags, bool kern)
2793{
2794 struct mptcp_sock *msk = mptcp_sk(sock->sk);
2795 struct socket *ssock;
2796 int err;
2797
2798 pr_debug("msk=%p", msk);
2799
2800 lock_sock(sock->sk);
2801 if (sock->sk->sk_state != TCP_LISTEN)
2802 goto unlock_fail;
2803
2804 ssock = __mptcp_nmpc_socket(msk);
2805 if (!ssock)
2806 goto unlock_fail;
2807
8a05661b 2808 clear_bit(MPTCP_DATA_READY, &msk->flags);
cf7da0d6
PK
2809 sock_hold(ssock->sk);
2810 release_sock(sock->sk);
2811
2812 err = ssock->ops->accept(sock, newsock, flags, kern);
d2f77c53 2813 if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
cf7da0d6
PK
2814 struct mptcp_sock *msk = mptcp_sk(newsock->sk);
2815 struct mptcp_subflow_context *subflow;
0397c6d8
PA
2816 struct sock *newsk = newsock->sk;
2817 bool slowpath;
2818
2819 slowpath = lock_sock_fast(newsk);
2820 mptcp_copy_inaddrs(newsk, msk->first);
2821 mptcp_rcv_space_init(msk, msk->first);
cf7da0d6
PK
2822
2823 /* set ssk->sk_socket of accept()ed flows to mptcp socket.
2824 * This is needed so NOSPACE flag can be set from tcp stack.
2825 */
ec3edaa7 2826 __mptcp_flush_join_list(msk);
190f8b06 2827 mptcp_for_each_subflow(msk, subflow) {
cf7da0d6
PK
2828 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
2829
2830 if (!ssk->sk_socket)
2831 mptcp_sock_graft(ssk, newsock);
2832 }
0397c6d8 2833 unlock_sock_fast(newsk, slowpath);
cf7da0d6
PK
2834 }
2835
8a05661b
PA
2836 if (inet_csk_listen_poll(ssock->sk))
2837 set_bit(MPTCP_DATA_READY, &msk->flags);
cf7da0d6
PK
2838 sock_put(ssock->sk);
2839 return err;
2840
2841unlock_fail:
2842 release_sock(sock->sk);
2843 return -EINVAL;
2844}
2845
8a05661b
PA
2846static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
2847{
2848 return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM :
2849 0;
2850}
2851
8edf0864
FW
2852static bool __mptcp_check_writeable(struct mptcp_sock *msk)
2853{
2854 struct sock *sk = (struct sock *)msk;
2855 bool mptcp_writable;
2856
2857 mptcp_clean_una(sk);
2858 mptcp_writable = sk_stream_is_writeable(sk);
2859 if (!mptcp_writable)
2860 mptcp_nospace(msk);
2861
2862 return mptcp_writable;
2863}
2864
2865static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
2866{
2867 struct sock *sk = (struct sock *)msk;
2868 __poll_t ret = 0;
2869 bool slow;
2870
2871 if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
2872 return 0;
2873
2874 if (sk_stream_is_writeable(sk))
2875 return EPOLLOUT | EPOLLWRNORM;
2876
2877 slow = lock_sock_fast(sk);
2878 if (__mptcp_check_writeable(msk))
2879 ret = EPOLLOUT | EPOLLWRNORM;
2880
2881 unlock_sock_fast(sk, slow);
2882 return ret;
2883}
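/* Poll writeability is checked twice: a lockless fast path via
 * sk_stream_is_writeable(), then, under the fast socket lock, a slow path
 * that first reclaims acked data (mptcp_clean_una()) and, if still not
 * writable, flags NOSPACE so a wakeup is generated later.
 */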
2884
2303f994
PK
2885static __poll_t mptcp_poll(struct file *file, struct socket *sock,
2886 struct poll_table_struct *wait)
2887{
1891c4a0 2888 struct sock *sk = sock->sk;
8ab183de 2889 struct mptcp_sock *msk;
2303f994 2890 __poll_t mask = 0;
8a05661b 2891 int state;
2303f994 2892
1891c4a0 2893 msk = mptcp_sk(sk);
1891c4a0 2894 sock_poll_wait(file, sock, wait);
1891c4a0 2895
8a05661b 2896 state = inet_sk_state_load(sk);
6719331c 2897 pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
8a05661b
PA
2898 if (state == TCP_LISTEN)
2899 return mptcp_check_readable(msk);
2900
2901 if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
2902 mask |= mptcp_check_readable(msk);
8edf0864 2903 mask |= mptcp_check_writeable(msk);
8a05661b 2904 }
1891c4a0
FW
2905 if (sk->sk_shutdown & RCV_SHUTDOWN)
2906 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
2907
2303f994
PK
2908 return mask;
2909}
2910
21498490
PK
2911static int mptcp_shutdown(struct socket *sock, int how)
2912{
2913 struct mptcp_sock *msk = mptcp_sk(sock->sk);
e16163b6 2914 struct sock *sk = sock->sk;
21498490
PK
2915 int ret = 0;
2916
2917 pr_debug("sk=%p, how=%d", msk, how);
2918
e16163b6 2919 lock_sock(sk);
21498490
PK
2920
2921 how++;
21498490
PK
2922 if ((how & ~SHUTDOWN_MASK) || !how) {
2923 ret = -EINVAL;
2924 goto out_unlock;
2925 }
2926
2927 if (sock->state == SS_CONNECTING) {
e16163b6 2928 if ((1 << sk->sk_state) &
21498490
PK
2929 (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
2930 sock->state = SS_DISCONNECTING;
2931 else
2932 sock->state = SS_CONNECTED;
2933 }
2934
e16163b6
PA
2935 sk->sk_shutdown |= how;
2936 if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk))
2937 __mptcp_wr_shutdown(sk);
21498490 2938
e1ff9e82 2939 /* Wake up anyone sleeping in poll. */
e16163b6 2940 sk->sk_state_change(sk);
e1ff9e82 2941
21498490 2942out_unlock:
e16163b6 2943 release_sock(sk);
21498490
PK
2944
2945 return ret;
2946}
2947
e42f1ac6
FW
2948static const struct proto_ops mptcp_stream_ops = {
2949 .family = PF_INET,
2950 .owner = THIS_MODULE,
2951 .release = inet_release,
2952 .bind = mptcp_bind,
2953 .connect = mptcp_stream_connect,
2954 .socketpair = sock_no_socketpair,
2955 .accept = mptcp_stream_accept,
d2f77c53 2956 .getname = inet_getname,
e42f1ac6
FW
2957 .poll = mptcp_poll,
2958 .ioctl = inet_ioctl,
2959 .gettstamp = sock_gettstamp,
2960 .listen = mptcp_listen,
2961 .shutdown = mptcp_shutdown,
2962 .setsockopt = sock_common_setsockopt,
2963 .getsockopt = sock_common_getsockopt,
2964 .sendmsg = inet_sendmsg,
2965 .recvmsg = inet_recvmsg,
2966 .mmap = sock_no_mmap,
2967 .sendpage = inet_sendpage,
e42f1ac6 2968};
2303f994 2969
f870fa0b
MM
2970static struct inet_protosw mptcp_protosw = {
2971 .type = SOCK_STREAM,
2972 .protocol = IPPROTO_MPTCP,
2973 .prot = &mptcp_prot,
2303f994
PK
2974 .ops = &mptcp_stream_ops,
2975 .flags = INET_PROTOSW_ICSK,
f870fa0b
MM
2976};
2977
d39dceca 2978void __init mptcp_proto_init(void)
f870fa0b 2979{
2303f994 2980 mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
2303f994 2981
d027236c
PA
2982 if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
2983 panic("Failed to allocate MPTCP pcpu counter\n");
2984
2303f994 2985 mptcp_subflow_init();
1b1c7a0e 2986 mptcp_pm_init();
2c5ebd00 2987 mptcp_token_init();
2303f994 2988
f870fa0b
MM
2989 if (proto_register(&mptcp_prot, 1) != 0)
2990 panic("Failed to register MPTCP proto.\n");
2991
2992 inet_register_protosw(&mptcp_protosw);
6771bfd9
FW
2993
2994 BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
f870fa0b
MM
2995}
2996
2997#if IS_ENABLED(CONFIG_MPTCP_IPV6)
e42f1ac6
FW
2998static const struct proto_ops mptcp_v6_stream_ops = {
2999 .family = PF_INET6,
3000 .owner = THIS_MODULE,
3001 .release = inet6_release,
3002 .bind = mptcp_bind,
3003 .connect = mptcp_stream_connect,
3004 .socketpair = sock_no_socketpair,
3005 .accept = mptcp_stream_accept,
d2f77c53 3006 .getname = inet6_getname,
e42f1ac6
FW
3007 .poll = mptcp_poll,
3008 .ioctl = inet6_ioctl,
3009 .gettstamp = sock_gettstamp,
3010 .listen = mptcp_listen,
3011 .shutdown = mptcp_shutdown,
3012 .setsockopt = sock_common_setsockopt,
3013 .getsockopt = sock_common_getsockopt,
3014 .sendmsg = inet6_sendmsg,
3015 .recvmsg = inet6_recvmsg,
3016 .mmap = sock_no_mmap,
3017 .sendpage = inet_sendpage,
3018#ifdef CONFIG_COMPAT
3986912f 3019 .compat_ioctl = inet6_compat_ioctl,
e42f1ac6
FW
3020#endif
3021};
3022
f870fa0b
MM
3023static struct proto mptcp_v6_prot;
3024
79c0949e
PK
3025static void mptcp_v6_destroy(struct sock *sk)
3026{
3027 mptcp_destroy(sk);
3028 inet6_destroy_sock(sk);
3029}
3030
f870fa0b
MM
3031static struct inet_protosw mptcp_v6_protosw = {
3032 .type = SOCK_STREAM,
3033 .protocol = IPPROTO_MPTCP,
3034 .prot = &mptcp_v6_prot,
2303f994 3035 .ops = &mptcp_v6_stream_ops,
f870fa0b
MM
3036 .flags = INET_PROTOSW_ICSK,
3037};
3038
d39dceca 3039int __init mptcp_proto_v6_init(void)
f870fa0b
MM
3040{
3041 int err;
3042
3043 mptcp_v6_prot = mptcp_prot;
3044 strcpy(mptcp_v6_prot.name, "MPTCPv6");
3045 mptcp_v6_prot.slab = NULL;
79c0949e 3046 mptcp_v6_prot.destroy = mptcp_v6_destroy;
b0519de8 3047 mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);
f870fa0b
MM
3048
3049 err = proto_register(&mptcp_v6_prot, 1);
3050 if (err)
3051 return err;
3052
3053 err = inet6_register_protosw(&mptcp_v6_protosw);
3054 if (err)
3055 proto_unregister(&mptcp_v6_prot);
3056
3057 return err;
3058}
3059#endif