// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

#define MPTCP_SAME_STATE TCP_MAX_STATES

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
	struct mptcp_sock msk;
	struct ipv6_pinfo np;
};
#endif

struct mptcp_skb_cb {
	u32 offset;
};

#define MPTCP_SKB_CB(__skb)	((struct mptcp_skb_cb *)&((__skb)->cb[0]))

static struct percpu_counter mptcp_sockets_allocated;

/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
 * completed yet or has failed, return the subflow socket.
 * Otherwise return NULL.
 */
static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
{
	if (!msk->subflow || READ_ONCE(msk->can_ack))
		return NULL;

	return msk->subflow;
}

static bool mptcp_is_tcpsk(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;

	if (unlikely(sk->sk_prot == &tcp_prot)) {
		/* we are being invoked after mptcp_accept() has
		 * accepted a non-mp-capable flow: sk is a tcp_sk,
		 * not an mptcp one.
		 *
		 * Hand the socket over to tcp so all further socket ops
		 * bypass mptcp.
		 */
		sock->ops = &inet_stream_ops;
		return true;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	} else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
		sock->ops = &inet6_stream_ops;
		return true;
#endif
	}

	return false;
}

static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
	sock_owned_by_me((const struct sock *)msk);

	if (likely(!__mptcp_check_fallback(msk)))
		return NULL;

	return msk->first;
}

static int __mptcp_socket_create(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int err;

	err = mptcp_subflow_create_socket(sk, &ssock);
	if (err)
		return err;

	msk->first = ssock->sk;
	msk->subflow = ssock;
	subflow = mptcp_subflow_ctx(ssock->sk);
	list_add(&subflow->node, &msk->conn_list);
	subflow->request_mptcp = 1;

	/* accept() will wait on first subflow sk_wq, and we always wake up
	 * via msk->sk_socket
	 */
	RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);

	return 0;
}

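/* move an skb that was received on subflow ssk into the msk receive queue,
 * coalescing it with the queue tail when possible; @offset is the position
 * of the first byte of new data inside the skb, tracked via MPTCP_SKB_CB()
 */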
static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
			     struct sk_buff *skb,
			     unsigned int offset, size_t copy_len)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *tail;

	__skb_unlink(skb, &ssk->sk_receive_queue);

	skb_ext_reset(skb);
	skb_orphan(skb);
	msk->ack_seq += copy_len;

	tail = skb_peek_tail(&sk->sk_receive_queue);
	if (offset == 0 && tail) {
		bool fragstolen;
		int delta;

		if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
			kfree_skb_partial(skb, fragstolen);
			atomic_add(delta, &sk->sk_rmem_alloc);
			sk_mem_charge(sk, delta);
			return;
		}
	}

	skb_set_owner_r(skb, sk);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	MPTCP_SKB_CB(skb)->offset = offset;
}

/* both sockets must be locked */
static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
				    struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u64 dsn = mptcp_subflow_get_mapped_dsn(subflow);

	/* revalidate data sequence number.
	 *
	 * mptcp_subflow_data_available() is usually called
	 * without msk lock. It's unlikely (but possible)
	 * that msk->ack_seq has been advanced since the last
	 * call found in-sequence data.
	 */
	if (likely(dsn == msk->ack_seq))
		return true;

	subflow->data_avail = 0;
	return mptcp_subflow_data_available(ssk);
}

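/* drain as much in-sequence data as possible from ssk's receive queue into
 * the msk receive queue; returns true when the caller should stop (FIN seen,
 * urgent data, or the msk receive buffer is full) and reports the number of
 * bytes moved via *bytes
 */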
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
					   struct sock *ssk,
					   unsigned int *bytes)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;
	bool more_data_avail;
	struct tcp_sock *tp;
	bool done = false;

	if (!mptcp_subflow_dsn_valid(msk, ssk)) {
		*bytes = 0;
		return false;
	}

	tp = tcp_sk(ssk);
	do {
		u32 map_remaining, offset;
		u32 seq = tp->copied_seq;
		struct sk_buff *skb;
		bool fin;

		/* try to move as much data as available */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);

		skb = skb_peek(&ssk->sk_receive_queue);
		if (!skb)
			break;

		if (__mptcp_check_fallback(msk)) {
			/* if we are running under the workqueue, TCP could have
			 * collapsed skbs between dummy map creation and now,
			 * be sure to adjust the size
			 */
			map_remaining = skb->len;
			subflow->map_data_len = skb->len;
		}

		offset = seq - TCP_SKB_CB(skb)->seq;
		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
		if (fin) {
			done = true;
			seq++;
		}

		if (offset < skb->len) {
			size_t len = skb->len - offset;

			if (tp->urg_data)
				done = true;

			__mptcp_move_skb(msk, ssk, skb, offset, len);
			seq += len;
			moved += len;

			if (WARN_ON_ONCE(map_remaining < len))
				break;
		} else {
			WARN_ON_ONCE(!fin);
			sk_eat_skb(ssk, skb);
			done = true;
		}

		WRITE_ONCE(tp->copied_seq, seq);
		more_data_avail = mptcp_subflow_data_available(ssk);

		if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) {
			done = true;
			break;
		}
	} while (more_data_avail);

	*bytes = moved;

	return done;
}

/* In most cases we will be able to lock the mptcp socket. If it's already
 * owned, we need to defer to the work queue to avoid ABBA deadlock.
 */
static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;

	if (READ_ONCE(sk->sk_lock.owned))
		return false;

	if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock)))
		return false;

	/* must re-check after taking the lock */
	if (!READ_ONCE(sk->sk_lock.owned))
		__mptcp_move_skbs_from_subflow(msk, ssk, &moved);

	spin_unlock_bh(&sk->sk_lock.slock);

	return moved > 0;
}

void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	set_bit(MPTCP_DATA_READY, &msk->flags);

	if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) &&
	    move_skbs_to_msk(msk, ssk))
		goto wake;

	/* don't schedule if mptcp sk is (still) over limit */
	if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
		goto wake;

	/* mptcp socket is owned, release_cb should retry */
	if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
			      &sk->sk_tsq_flags)) {
		sock_hold(sk);

		/* need to try again, it's possible release_cb() has already
		 * been called after the test_and_set_bit() above.
		 */
		move_skbs_to_msk(msk, ssk);
	}
wake:
	sk->sk_data_ready(sk);
}

static void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
	if (likely(list_empty(&msk->join_list)))
		return;

	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
}

static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
{
	long tout = ssk && inet_csk(ssk)->icsk_pending ?
		    inet_csk(ssk)->icsk_timeout - jiffies : 0;

	if (tout <= 0)
		tout = mptcp_sk(sk)->timer_ival;
	mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}

static bool mptcp_timer_pending(struct sock *sk)
{
	return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
}

static void mptcp_reset_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned long tout;

	/* should never be called with mptcp level timer cleared */
	tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
	if (WARN_ON_ONCE(!tout))
		tout = TCP_RTO_MIN;
	sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
}

void mptcp_data_acked(struct sock *sk)
{
	mptcp_reset_timer(sk);

	if (!sk_stream_is_writeable(sk) &&
	    schedule_work(&mptcp_sk(sk)->work))
		sock_hold(sk);
}

void mptcp_subflow_eof(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) &&
	    schedule_work(&msk->work))
		sock_hold(sk);
}

static void mptcp_check_for_eof(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	int receivers = 0;

	mptcp_for_each_subflow(msk, subflow)
		receivers += !subflow->rx_eof;

	if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		/* hopefully temporary hack: propagate shutdown status
		 * to msk, when all subflows agree on it
		 */
		sk->sk_shutdown |= RCV_SHUTDOWN;

		smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
		set_bit(MPTCP_DATA_READY, &msk->flags);
		sk->sk_data_ready(sk);
	}
}

static void mptcp_stop_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
	mptcp_sk(sk)->timer_ival = 0;
}

static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
{
	const struct sock *sk = (const struct sock *)msk;

	if (!msk->cached_ext)
		msk->cached_ext = __skb_ext_alloc(sk->sk_allocation);

	return !!msk->cached_ext;
}

static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;

	sock_owned_by_me(sk);

	mptcp_for_each_subflow(msk, subflow) {
		if (subflow->data_avail)
			return mptcp_subflow_tcp_sock(subflow);
	}

	return NULL;
}

static bool mptcp_skb_can_collapse_to(u64 write_seq,
				      const struct sk_buff *skb,
				      const struct mptcp_ext *mpext)
{
	if (!tcp_skb_can_collapse_to(skb))
		return false;

	/* can collapse only if MPTCP level sequence is in order */
	return mpext && mpext->data_seq + mpext->data_len == write_seq;
}

static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
				       const struct page_frag *pfrag,
				       const struct mptcp_data_frag *df)
{
	return df && pfrag->page == df->page &&
		df->data_seq + df->data_len == msk->write_seq;
}

static void dfrag_uncharge(struct sock *sk, int len)
{
	sk_mem_uncharge(sk, len);
	sk_wmem_queued_add(sk, -len);
}

static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
{
	int len = dfrag->data_len + dfrag->overhead;

	list_del(&dfrag->list);
	dfrag_uncharge(sk, len);
	put_page(dfrag->page);
}

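/* drop retransmit-queue fragments that are fully acked at the MPTCP level
 * (per msk->snd_una) and trim the head fragment if it is only partially
 * acked, releasing the memory charged to the msk for that data
 */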
static void mptcp_clean_una(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_data_frag *dtmp, *dfrag;
	bool cleaned = false;
	u64 snd_una;

	/* on fallback we just need to ignore snd_una, as this is really
	 * plain TCP
	 */
	if (__mptcp_check_fallback(msk))
		atomic64_set(&msk->snd_una, msk->write_seq);
	snd_una = atomic64_read(&msk->snd_una);

	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
			break;

		dfrag_clear(sk, dfrag);
		cleaned = true;
	}

	dfrag = mptcp_rtx_head(sk);
	if (dfrag && after64(snd_una, dfrag->data_seq)) {
		u64 delta = snd_una - dfrag->data_seq;

		if (WARN_ON_ONCE(delta > dfrag->data_len))
			goto out;

		dfrag->data_seq += delta;
		dfrag->offset += delta;
		dfrag->data_len -= delta;

		dfrag_uncharge(sk, delta);
		cleaned = true;
	}

out:
	if (cleaned) {
		sk_mem_reclaim_partial(sk);

		/* Only wake up writers if a subflow is ready */
		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
			sk_stream_write_space(sk);
	}
}

/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
 * data
 */
static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
					pfrag, sk->sk_allocation)))
		return true;

	sk->sk_prot->enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}

static struct mptcp_data_frag *
mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
		      int orig_offset)
{
	int offset = ALIGN(orig_offset, sizeof(long));
	struct mptcp_data_frag *dfrag;

	dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
	dfrag->data_len = 0;
	dfrag->data_seq = msk->write_seq;
	dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
	dfrag->offset = offset + sizeof(struct mptcp_data_frag);
	dfrag->page = pfrag->page;

	return dfrag;
}

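/* transmit (or retransmit, when dfrag != NULL) at most one chunk of data on
 * subflow ssk: copy it into the shared page frag, hand it to
 * do_tcp_sendpages() and attach or extend the MPTCP DSS mapping (skb
 * extension) covering the bytes just queued
 */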
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
			      struct msghdr *msg, struct mptcp_data_frag *dfrag,
			      long *timeo, int *pmss_now,
			      int *ps_goal)
{
	int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
	bool dfrag_collapsed, can_collapse = false;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_ext *mpext = NULL;
	bool retransmission = !!dfrag;
	struct sk_buff *skb, *tail;
	struct page_frag *pfrag;
	struct page *page;
	u64 *write_seq;
	size_t psize;

	/* use the mptcp page cache so that we can easily move the data
	 * from one substream to another, but do per subflow memory accounting
	 * Note: pfrag is used only when !retransmission, but the compiler is
	 * fooled into a warning if we don't init here
	 */
	pfrag = sk_page_frag(sk);
	if (!retransmission) {
		write_seq = &msk->write_seq;
		page = pfrag->page;
	} else {
		write_seq = &dfrag->data_seq;
		page = dfrag->page;
	}

	/* compute copy limit */
	mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
	*pmss_now = mss_now;
	*ps_goal = size_goal;
	avail_size = size_goal;
	skb = tcp_write_queue_tail(ssk);
	if (skb) {
		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);

		/* Limit the write to the size available in the
		 * current skb, if any, so that we create at most one new skb.
		 * Explicitly tells TCP internals to avoid collapsing on later
		 * queue management operation, to avoid breaking the ext <->
		 * SSN association set here
		 */
		can_collapse = (size_goal - skb->len > 0) &&
			       mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
		if (!can_collapse)
			TCP_SKB_CB(skb)->eor = 1;
		else
			avail_size = size_goal - skb->len;
	}

	if (!retransmission) {
		/* reuse tail pfrag, if possible, or carve a new one from the
		 * page allocator
		 */
		dfrag = mptcp_rtx_tail(sk);
		offset = pfrag->offset;
		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
		if (!dfrag_collapsed) {
			dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
			offset = dfrag->offset;
			frag_truesize = dfrag->overhead;
		}
		psize = min_t(size_t, pfrag->size - offset, avail_size);

		/* Copy to page */
		pr_debug("left=%zu", msg_data_left(msg));
		psize = copy_page_from_iter(pfrag->page, offset,
					    min_t(size_t, msg_data_left(msg),
						  psize),
					    &msg->msg_iter);
		pr_debug("left=%zu", msg_data_left(msg));
		if (!psize)
			return -EINVAL;

		if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
			return -ENOMEM;
	} else {
		offset = dfrag->offset;
		psize = min_t(size_t, dfrag->data_len, avail_size);
	}

	/* tell the TCP stack to delay the push so that we can safely
	 * access the skb after the sendpages call
	 */
	ret = do_tcp_sendpages(ssk, page, offset, psize,
			       msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT);
	if (ret <= 0)
		return ret;

	frag_truesize += ret;
	if (!retransmission) {
		if (unlikely(ret < psize))
			iov_iter_revert(&msg->msg_iter, psize - ret);

		/* send successful, keep track of sent data for mptcp-level
		 * retransmission
		 */
		dfrag->data_len += ret;
		if (!dfrag_collapsed) {
			get_page(dfrag->page);
			list_add_tail(&dfrag->list, &msk->rtx_queue);
			sk_wmem_queued_add(sk, frag_truesize);
		} else {
			sk_wmem_queued_add(sk, ret);
		}

		/* charge data on mptcp rtx queue to the master socket
		 * Note: we charge such data both to sk and ssk
		 */
		sk->sk_forward_alloc -= frag_truesize;
	}

	/* if the tail skb extension is still the cached one, collapsing
	 * really happened. Note: we can't check for 'same skb' as the sk_buff
	 * hdr on tail can be transmitted, freed and re-allocated by the
	 * do_tcp_sendpages() call
	 */
	tail = tcp_write_queue_tail(ssk);
	if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) {
		WARN_ON_ONCE(!can_collapse);
		mpext->data_len += ret;
		goto out;
	}

	skb = tcp_write_queue_tail(ssk);
	mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
	msk->cached_ext = NULL;

	memset(mpext, 0, sizeof(*mpext));
	mpext->data_seq = *write_seq;
	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
	mpext->data_len = ret;
	mpext->use_map = 1;
	mpext->dsn64 = 1;

	pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
		 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
		 mpext->dsn64);

out:
	if (!retransmission)
		pfrag->offset += frag_truesize;
	*write_seq += ret;
	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;

	return ret;
}

static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
{
	clear_bit(MPTCP_SEND_SPACE, &msk->flags);
	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */

	/* enables sk->write_space() callbacks */
	set_bit(SOCK_NOSPACE, &sock->flags);
}

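/* select the subflow to transmit on: the first non-backup subflow found, or a
 * backup one if that is all we have; bails out with NULL (after clearing the
 * msk send-space flag) as soon as one subflow runs out of write memory
 */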
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *backup = NULL;

	sock_owned_by_me((const struct sock *)msk);

	if (!mptcp_ext_cache_refill(msk))
		return NULL;

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (!sk_stream_memory_free(ssk)) {
			struct socket *sock = ssk->sk_socket;

			if (sock)
				mptcp_nospace(msk, sock);

			return NULL;
		}

		if (subflow->backup) {
			if (!backup)
				backup = ssk;

			continue;
		}

		return ssk;
	}

	return backup;
}

static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
{
	struct socket *sock;

	if (likely(sk_stream_is_writeable(ssk)))
		return;

	sock = READ_ONCE(ssk->sk_socket);
	if (sock)
		mptcp_nospace(msk, sock);
}

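/* msk sendmsg(): wait for an established state and a subflow with both
 * subflow-level and MPTCP-level send space, then push data one fragment at a
 * time via mptcp_sendmsg_frag(), arming the MPTCP retransmit timer before
 * blocking or returning
 */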
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	int mss_now = 0, size_goal = 0, ret = 0;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct page_frag *pfrag;
	size_t copied = 0;
	struct sock *ssk;
	bool tx_ok;
	long timeo;

	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
		return -EOPNOTSUPP;

	lock_sock(sk);

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
		ret = sk_stream_wait_connect(sk, &timeo);
		if (ret)
			goto out;
	}

	pfrag = sk_page_frag(sk);
restart:
	mptcp_clean_una(sk);

wait_for_sndbuf:
	__mptcp_flush_join_list(msk);
	ssk = mptcp_subflow_get_send(msk);
	while (!sk_stream_memory_free(sk) ||
	       !ssk ||
	       !mptcp_page_frag_refill(ssk, pfrag)) {
		if (ssk) {
			/* make sure retransmit timer is
			 * running before we wait for memory.
			 *
			 * The retransmit timer might be needed
			 * to make the peer send an up-to-date
			 * MPTCP Ack.
			 */
			mptcp_set_timeout(sk, ssk);
			if (!mptcp_timer_pending(sk))
				mptcp_reset_timer(sk);
		}

		ret = sk_stream_wait_memory(sk, &timeo);
		if (ret)
			goto out;

		mptcp_clean_una(sk);

		ssk = mptcp_subflow_get_send(msk);
		if (list_empty(&msk->conn_list)) {
			ret = -ENOTCONN;
			goto out;
		}
	}

	pr_debug("conn_list->subflow=%p", ssk);

	lock_sock(ssk);
	tx_ok = msg_data_left(msg);
	while (tx_ok) {
		ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
					 &size_goal);
		if (ret < 0) {
			if (ret == -EAGAIN && timeo > 0) {
				mptcp_set_timeout(sk, ssk);
				release_sock(ssk);
				goto restart;
			}
			break;
		}

		copied += ret;

		tx_ok = msg_data_left(msg);
		if (!tx_ok)
			break;

		if (!sk_stream_memory_free(ssk) ||
		    !mptcp_page_frag_refill(ssk, pfrag) ||
		    !mptcp_ext_cache_refill(msk)) {
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			tcp_push(ssk, msg->msg_flags, mss_now,
				 tcp_sk(ssk)->nonagle, size_goal);
			mptcp_set_timeout(sk, ssk);
			release_sock(ssk);
			goto restart;
		}

		/* memory is charged to mptcp level socket as well, i.e.
		 * if msg is very large, mptcp socket may run out of buffer
		 * space. mptcp_clean_una() will release data that has
		 * been acked at mptcp level in the meantime, so there is
		 * a good chance we can continue sending data right away.
		 *
		 * Normally, when the tcp subflow can accept more data, then
		 * so can the MPTCP socket. However, we need to cope with
		 * peers that might lag behind in their MPTCP-level
		 * acknowledgements, i.e. data might have been acked at
		 * tcp level only. So, we must also check the MPTCP socket
		 * limits before we send more data.
		 */
		if (unlikely(!sk_stream_memory_free(sk))) {
			tcp_push(ssk, msg->msg_flags, mss_now,
				 tcp_sk(ssk)->nonagle, size_goal);
			mptcp_clean_una(sk);
			if (!sk_stream_memory_free(sk)) {
				/* can't send more for now, need to wait for
				 * MPTCP-level ACKs from peer.
				 *
				 * Wakeup will happen via mptcp_clean_una().
				 */
				mptcp_set_timeout(sk, ssk);
				release_sock(ssk);
				goto wait_for_sndbuf;
			}
		}
	}

	mptcp_set_timeout(sk, ssk);
	if (copied) {
		ret = copied;
		tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
			 size_goal);

		/* start the timer, if it's not pending */
		if (!mptcp_timer_pending(sk))
			mptcp_reset_timer(sk);
	}

	ssk_check_wmem(msk, ssk);
	release_sock(ssk);
out:
	release_sock(sk);
	return ret;
}

static void mptcp_wait_data(struct sock *sk, long *timeo)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct mptcp_sock *msk = mptcp_sk(sk);

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);

	sk_wait_event(sk, timeo,
		      test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);

	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
}

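/* copy up to @len bytes from the msk receive queue into @msg, consuming skbs
 * as they are fully read and remembering the partial-read position in
 * MPTCP_SKB_CB(skb)->offset otherwise
 */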
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
				struct msghdr *msg,
				size_t len)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *skb;
	int copied = 0;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		u32 offset = MPTCP_SKB_CB(skb)->offset;
		u32 data_len = skb->len - offset;
		u32 count = min_t(size_t, len - copied, data_len);
		int err;

		err = skb_copy_datagram_msg(skb, offset, msg, count);
		if (unlikely(err < 0)) {
			if (!copied)
				return err;
			break;
		}

		copied += count;

		if (count < data_len) {
			MPTCP_SKB_CB(skb)->offset += count;
			break;
		}

		__skb_unlink(skb, &sk->sk_receive_queue);
		__kfree_skb(skb);

		if (copied >= len)
			break;
	}

	return copied;
}

/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
 *
 * Only difference: Use highest rtt estimate of the subflows in use.
 */
static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	u32 time, advmss = 1;
	u64 rtt_us, mstamp;

	sock_owned_by_me(sk);

	if (copied <= 0)
		return;

	msk->rcvq_space.copied += copied;

	mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
	time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);

	rtt_us = msk->rcvq_space.rtt_us;
	if (rtt_us && time < (rtt_us >> 3))
		return;

	rtt_us = 0;
	mptcp_for_each_subflow(msk, subflow) {
		const struct tcp_sock *tp;
		u64 sf_rtt_us;
		u32 sf_advmss;

		tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));

		sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
		sf_advmss = READ_ONCE(tp->advmss);

		rtt_us = max(sf_rtt_us, rtt_us);
		advmss = max(sf_advmss, advmss);
	}

	msk->rcvq_space.rtt_us = rtt_us;
	if (time < (rtt_us >> 3) || rtt_us == 0)
		return;

	if (msk->rcvq_space.copied <= msk->rcvq_space.space)
		goto new_measure;

	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int rcvmem, rcvbuf;
		u64 rcvwin, grow;

		rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;

		grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);

		do_div(grow, msk->rcvq_space.space);
		rcvwin += (grow << 1);

		rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
		while (tcp_win_from_space(sk, rcvmem) < advmss)
			rcvmem += 128;

		do_div(rcvwin, advmss);
		rcvbuf = min_t(u64, rcvwin * rcvmem,
			       sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);

		if (rcvbuf > sk->sk_rcvbuf) {
			u32 window_clamp;

			window_clamp = tcp_win_from_space(sk, rcvbuf);
			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

			/* Make subflows follow along. If we do not do this, we
			 * get drops at subflow level if skbs can't be moved to
			 * the mptcp rx queue fast enough (announced rcv_win can
			 * exceed ssk->sk_rcvbuf).
			 */
			mptcp_for_each_subflow(msk, subflow) {
				struct sock *ssk;

				ssk = mptcp_subflow_tcp_sock(subflow);
				WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
				tcp_sk(ssk)->window_clamp = window_clamp;
			}
		}
	}

	msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure:
	msk->rcvq_space.copied = 0;
	msk->rcvq_space.time = mstamp;
}

static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
	unsigned int moved = 0;
	bool done;

	do {
		struct sock *ssk = mptcp_subflow_recv_lookup(msk);

		if (!ssk)
			break;

		lock_sock(ssk);
		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
		release_sock(ssk);
	} while (!done);

	return moved > 0;
}

static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			 int nonblock, int flags, int *addr_len)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	int copied = 0;
	int target;
	long timeo;

	if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
		return -EOPNOTSUPP;

	lock_sock(sk);
	timeo = sock_rcvtimeo(sk, nonblock);

	len = min_t(size_t, len, INT_MAX);
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
	__mptcp_flush_join_list(msk);

	while (len > (size_t)copied) {
		int bytes_read;

		bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
		if (unlikely(bytes_read < 0)) {
			if (!copied)
				copied = bytes_read;
			goto out_err;
		}

		copied += bytes_read;

		if (skb_queue_empty(&sk->sk_receive_queue) &&
		    __mptcp_move_skbs(msk))
			continue;

		/* only the master socket status is relevant here. The exit
		 * conditions mirror closely tcp_recvmsg()
		 */
		if (copied >= target)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
				mptcp_check_for_eof(msk);

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				copied = -ENOTCONN;
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		pr_debug("block timeout %ld", timeo);
		mptcp_wait_data(sk, &timeo);
	}

	if (skb_queue_empty(&sk->sk_receive_queue)) {
		/* entire backlog drained, clear DATA_READY. */
		clear_bit(MPTCP_DATA_READY, &msk->flags);

		/* .. race-breaker: ssk might have gotten new data
		 * after last __mptcp_move_skbs() returned false.
		 */
		if (unlikely(__mptcp_move_skbs(msk)))
			set_bit(MPTCP_DATA_READY, &msk->flags);
	} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
		/* data to read but mptcp_wait_data() cleared DATA_READY */
		set_bit(MPTCP_DATA_READY, &msk->flags);
	}
out_err:
	mptcp_rcv_space_adjust(msk, copied);

	release_sock(sk);
	return copied;
}

static void mptcp_retransmit_handler(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (atomic64_read(&msk->snd_una) == msk->write_seq) {
		mptcp_stop_timer(sk);
	} else {
		set_bit(MPTCP_WORK_RTX, &msk->flags);
		if (schedule_work(&msk->work))
			sock_hold(sk);
	}
}

static void mptcp_retransmit_timer(struct timer_list *t)
{
	struct inet_connection_sock *icsk = from_timer(icsk, t,
						       icsk_retransmit_timer);
	struct sock *sk = &icsk->icsk_inet.sk;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		mptcp_retransmit_handler(sk);
	} else {
		/* delegate our work to tcp_release_cb() */
		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
				      &sk->sk_tsq_flags))
			sock_hold(sk);
	}
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* Find an idle subflow. Return NULL if there is unacked data at tcp
 * level.
 *
 * A backup subflow is returned only if that is the only kind available.
 */
static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *backup = NULL;

	sock_owned_by_me((const struct sock *)msk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		/* still data outstanding at TCP level? Don't retransmit. */
		if (!tcp_write_queue_empty(ssk))
			return NULL;

		if (subflow->backup) {
			if (!backup)
				backup = ssk;
			continue;
		}

		return ssk;
	}

	return backup;
}

/* subflow sockets can be either outgoing (connect) or incoming
 * (accept).
 *
 * Outgoing subflows use in-kernel sockets.
 * Incoming subflows do not have their own 'struct socket' allocated,
 * so we need to use tcp_close() after detaching them from the mptcp
 * parent socket.
 */
static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
			      struct mptcp_subflow_context *subflow,
			      long timeout)
{
	struct socket *sock = READ_ONCE(ssk->sk_socket);

	list_del(&subflow->node);

	if (sock && sock != sk->sk_socket) {
		/* outgoing subflow */
		sock_release(sock);
	} else {
		/* incoming subflow */
		tcp_close(ssk, timeout);
	}
}

static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
{
	return 0;
}

static void pm_work(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;

	spin_lock_bh(&msk->pm.lock);

	pr_debug("msk=%p status=%x", msk, pm->status);
	if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
		pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
		mptcp_pm_nl_add_addr_received(msk);
	}
	if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
		pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
		mptcp_pm_nl_fully_established(msk);
	}
	if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
		pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
		mptcp_pm_nl_subflow_established(msk);
	}

	spin_unlock_bh(&msk->pm.lock);
}

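/* msk work queue handler: flushes the join list, moves pending data from the
 * subflows, runs path-manager and EOF housekeeping and, when MPTCP_WORK_RTX
 * is set, retransmits the head of the rtx queue on a free subflow
 */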
static void mptcp_worker(struct work_struct *work)
{
	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
	struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
	int orig_len, orig_offset, mss_now = 0, size_goal = 0;
	struct mptcp_data_frag *dfrag;
	u64 orig_write_seq;
	size_t copied = 0;
	struct msghdr msg;
	long timeo = 0;

	lock_sock(sk);
	mptcp_clean_una(sk);
	__mptcp_flush_join_list(msk);
	__mptcp_move_skbs(msk);

	if (msk->pm.status)
		pm_work(msk);

	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
		mptcp_check_for_eof(msk);

	if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
		goto unlock;

	dfrag = mptcp_rtx_head(sk);
	if (!dfrag)
		goto unlock;

	if (!mptcp_ext_cache_refill(msk))
		goto reset_unlock;

	ssk = mptcp_subflow_get_retrans(msk);
	if (!ssk)
		goto reset_unlock;

	lock_sock(ssk);

	msg.msg_flags = MSG_DONTWAIT;
	orig_len = dfrag->data_len;
	orig_offset = dfrag->offset;
	orig_write_seq = dfrag->data_seq;
	while (dfrag->data_len > 0) {
		int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo,
					     &mss_now, &size_goal);
		if (ret < 0)
			break;

		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
		copied += ret;
		dfrag->data_len -= ret;
		dfrag->offset += ret;

		if (!mptcp_ext_cache_refill(msk))
			break;
	}
	if (copied)
		tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
			 size_goal);

	dfrag->data_seq = orig_write_seq;
	dfrag->offset = orig_offset;
	dfrag->data_len = orig_len;

	mptcp_set_timeout(sk, ssk);
	release_sock(ssk);

reset_unlock:
	if (!mptcp_timer_pending(sk))
		mptcp_reset_timer(sk);

unlock:
	release_sock(sk);
	sock_put(sk);
}

static int __mptcp_init_sock(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	spin_lock_init(&msk->join_list_lock);

	INIT_LIST_HEAD(&msk->conn_list);
	INIT_LIST_HEAD(&msk->join_list);
	INIT_LIST_HEAD(&msk->rtx_queue);
	__set_bit(MPTCP_SEND_SPACE, &msk->flags);
	INIT_WORK(&msk->work, mptcp_worker);

	msk->first = NULL;
	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;

	mptcp_pm_data_init(msk);

	/* re-use the csk retrans timer for MPTCP-level retrans */
	timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);

	return 0;
}

static int mptcp_init_sock(struct sock *sk)
{
	struct net *net = sock_net(sk);
	int ret;

	if (!mptcp_is_enabled(net))
		return -ENOPROTOOPT;

	if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
		return -ENOMEM;

	ret = __mptcp_init_sock(sk);
	if (ret)
		return ret;

	ret = __mptcp_socket_create(mptcp_sk(sk));
	if (ret)
		return ret;

	sk_sockets_allocated_inc(sk);
	sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
	sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];

	return 0;
}

static void __mptcp_clear_xmit(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_data_frag *dtmp, *dfrag;

	sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);

	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
		dfrag_clear(sk, dfrag);
}

static void mptcp_cancel_work(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (cancel_work_sync(&msk->work))
		sock_put(sk);
}

static void mptcp_subflow_shutdown(struct sock *ssk, int how,
				   bool data_fin_tx_enable, u64 data_fin_tx_seq)
{
	lock_sock(ssk);

	switch (ssk->sk_state) {
	case TCP_LISTEN:
		if (!(how & RCV_SHUTDOWN))
			break;
		/* fall through */
	case TCP_SYN_SENT:
		tcp_disconnect(ssk, O_NONBLOCK);
		break;
	default:
		if (data_fin_tx_enable) {
			struct mptcp_subflow_context *subflow;

			subflow = mptcp_subflow_ctx(ssk);
			subflow->data_fin_tx_seq = data_fin_tx_seq;
			subflow->data_fin_tx_enable = 1;
		}

		ssk->sk_shutdown |= how;
		tcp_shutdown(ssk, how);
		break;
	}

	release_sock(ssk);
}

/* Called with msk lock held, releases such lock before returning */
static void mptcp_close(struct sock *sk, long timeout)
{
	struct mptcp_subflow_context *subflow, *tmp;
	struct mptcp_sock *msk = mptcp_sk(sk);
	LIST_HEAD(conn_list);
	u64 data_fin_tx_seq;

	lock_sock(sk);

	inet_sk_state_store(sk, TCP_CLOSE);

	/* be sure to always acquire the join list lock, to sync vs
	 * mptcp_finish_join().
	 */
	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
	list_splice_init(&msk->conn_list, &conn_list);

	data_fin_tx_seq = msk->write_seq;

	__mptcp_clear_xmit(sk);

	release_sock(sk);

	list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		subflow->data_fin_tx_seq = data_fin_tx_seq;
		subflow->data_fin_tx_enable = 1;
		__mptcp_close_ssk(sk, ssk, subflow, timeout);
	}

	mptcp_cancel_work(sk);

	__skb_queue_purge(&sk->sk_receive_queue);

	sk_common_release(sk);
}

static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
	struct ipv6_pinfo *msk6 = inet6_sk(msk);

	msk->sk_v6_daddr = ssk->sk_v6_daddr;
	msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;

	if (msk6 && ssk6) {
		msk6->saddr = ssk6->saddr;
		msk6->flow_label = ssk6->flow_label;
	}
#endif

	inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
	inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
	inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
	inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
	inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
	inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}

static int mptcp_disconnect(struct sock *sk, int flags)
{
	/* Should never be called.
	 * inet_stream_connect() calls ->disconnect, but that
	 * refers to the subflow socket, not the mptcp one.
	 */
	WARN_ON_ONCE(1);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
	unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);

	return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}
#endif

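/* clone the passive msk from the listener socket and the request sock:
 * inherit keys and token from the MP_CAPABLE handshake and initialise the
 * MPTCP-level sequence numbers; the new socket is returned unlocked, holding
 * a single reference
 */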
struct sock *mptcp_sk_clone(const struct sock *sk,
			    const struct mptcp_options_received *mp_opt,
			    struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
	struct mptcp_sock *msk;
	u64 ack_seq;

	if (!nsk)
		return NULL;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (nsk->sk_family == AF_INET6)
		inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif

	__mptcp_init_sock(nsk);

	msk = mptcp_sk(nsk);
	msk->local_key = subflow_req->local_key;
	msk->token = subflow_req->token;
	msk->subflow = NULL;
	WRITE_ONCE(msk->fully_established, false);

	msk->write_seq = subflow_req->idsn + 1;
	atomic64_set(&msk->snd_una, msk->write_seq);
	if (mp_opt->mp_capable) {
		msk->can_ack = true;
		msk->remote_key = mp_opt->sndr_key;
		mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
		ack_seq++;
		msk->ack_seq = ack_seq;
	}

	sock_reset_flag(nsk, SOCK_RCU_FREE);
	/* will be fully established after successful MPC subflow creation */
	inet_sk_state_store(nsk, TCP_SYN_RECV);
	bh_unlock_sock(nsk);

	/* keep a single reference */
	__sock_put(nsk);
	return nsk;
}

void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
{
	const struct tcp_sock *tp = tcp_sk(ssk);

	msk->rcvq_space.copied = 0;
	msk->rcvq_space.rtt_us = 0;

	msk->rcvq_space.time = tp->tcp_mstamp;

	/* initial rcv_space offering made to peer */
	msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
				      TCP_INIT_CWND * tp->advmss);
	if (msk->rcvq_space.space == 0)
		msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
}

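/* accept() on the msk: pull the next connection from the first subflow's
 * accept queue; for MP_CAPABLE peers hand back the owning mptcp socket that
 * subflow_syn_recv_sock() created, otherwise fall back to returning the
 * plain TCP socket
 */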
static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
				 bool kern)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *listener;
	struct sock *newsk;

	listener = __mptcp_nmpc_socket(msk);
	if (WARN_ON_ONCE(!listener)) {
		*err = -EINVAL;
		return NULL;
	}

	pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
	newsk = inet_csk_accept(listener->sk, flags, err, kern);
	if (!newsk)
		return NULL;

	pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
	if (sk_is_mptcp(newsk)) {
		struct mptcp_subflow_context *subflow;
		struct sock *new_mptcp_sock;
		struct sock *ssk = newsk;

		subflow = mptcp_subflow_ctx(newsk);
		new_mptcp_sock = subflow->conn;

		/* is_mptcp should be false if subflow->conn is missing, see
		 * subflow_syn_recv_sock()
		 */
		if (WARN_ON_ONCE(!new_mptcp_sock)) {
			tcp_sk(newsk)->is_mptcp = 0;
			return newsk;
		}

		/* acquire the 2nd reference for the owning socket */
		sock_hold(new_mptcp_sock);

		local_bh_disable();
		bh_lock_sock(new_mptcp_sock);
		msk = mptcp_sk(new_mptcp_sock);
		msk->first = newsk;

		newsk = new_mptcp_sock;
		mptcp_copy_inaddrs(newsk, ssk);
		list_add(&subflow->node, &msk->conn_list);

		mptcp_rcv_space_init(msk, ssk);
		bh_unlock_sock(new_mptcp_sock);

		__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
		local_bh_enable();
	} else {
		MPTCP_INC_STATS(sock_net(sk),
				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
	}

	return newsk;
}

static void mptcp_destroy(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	mptcp_token_destroy(msk);
	if (msk->cached_ext)
		__skb_ext_put(msk->cached_ext);

	sk_sockets_allocated_dec(sk);
}

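/* SOL_SOCKET options that affect bind/listen semantics (SO_REUSEPORT,
 * SO_REUSEADDR) are applied to the initial subflow and mirrored on the msk;
 * everything else is simply applied to the msk socket itself
 */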
static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
				       char __user *optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int ret;

	switch (optname) {
	case SO_REUSEPORT:
	case SO_REUSEADDR:
		lock_sock(sk);
		ssock = __mptcp_nmpc_socket(msk);
		if (!ssock) {
			release_sock(sk);
			return -EINVAL;
		}

		ret = sock_setsockopt(ssock, SOL_SOCKET, optname,
				      USER_SOCKPTR(optval), optlen);
		if (ret == 0) {
			if (optname == SO_REUSEPORT)
				sk->sk_reuseport = ssock->sk->sk_reuseport;
			else if (optname == SO_REUSEADDR)
				sk->sk_reuse = ssock->sk->sk_reuse;
		}
		release_sock(sk);
		return ret;
	}

	return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
			       USER_SOCKPTR(optval), optlen);
}

static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
			       char __user *optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	int ret = -EOPNOTSUPP;
	struct socket *ssock;

	switch (optname) {
	case IPV6_V6ONLY:
		lock_sock(sk);
		ssock = __mptcp_nmpc_socket(msk);
		if (!ssock) {
			release_sock(sk);
			return -EINVAL;
		}

		ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
		if (ret == 0)
			sk->sk_ipv6only = ssock->sk->sk_ipv6only;

		release_sock(sk);
		break;
	}

	return ret;
}

static int mptcp_setsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, unsigned int optlen)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;

	pr_debug("msk=%p", msk);

	if (level == SOL_SOCKET)
		return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);

	/* @@ the meaning of setsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when TCP socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssk)
		return tcp_setsockopt(ssk, level, optname, optval, optlen);

	if (level == SOL_IPV6)
		return mptcp_setsockopt_v6(msk, optname, optval, optlen);

	return -EOPNOTSUPP;
}

static int mptcp_getsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, int __user *option)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;

	pr_debug("msk=%p", msk);

	/* @@ the meaning of getsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssk)
		return tcp_getsockopt(ssk, level, optname, optval, option);

	return -EOPNOTSUPP;
}

#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \
			    TCPF_WRITE_TIMER_DEFERRED)

/* this is very similar to tcp_release_cb(), but we must handle a
 * different set of events
 */
static void mptcp_release_cb(struct sock *sk)
{
	unsigned long flags, nflags;

	do {
		flags = sk->sk_tsq_flags;
		if (!(flags & MPTCP_DEFERRED_ALL))
			return;
		nflags = flags & ~MPTCP_DEFERRED_ALL;
	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);

	sock_release_ownership(sk);

	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
		struct mptcp_sock *msk = mptcp_sk(sk);
		struct sock *ssk;

		ssk = mptcp_subflow_recv_lookup(msk);
		if (!ssk || !schedule_work(&msk->work))
			__sock_put(sk);
	}

	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
		mptcp_retransmit_handler(sk);
		__sock_put(sk);
	}
}

static int mptcp_hash(struct sock *sk)
{
	/* should never be called,
	 * we hash the TCP subflows not the master socket
	 */
	WARN_ON_ONCE(1);
	return 0;
}

static void mptcp_unhash(struct sock *sk)
{
	/* called from sk_common_release(), but nothing to do here */
}

static int mptcp_get_port(struct sock *sk, unsigned short snum)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *ssock;

	ssock = __mptcp_nmpc_socket(msk);
	pr_debug("msk=%p, subflow=%p", msk, ssock);
	if (WARN_ON_ONCE(!ssock))
		return -EINVAL;

	return inet_csk_get_port(ssock->sk, snum);
}

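/* called by the subflow when the MP_CAPABLE handshake of the first subflow
 * completes: copy keys and initial sequence numbers from the subflow context
 * into the msk and notify the path manager of the new connection
 */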
1807void mptcp_finish_connect(struct sock *ssk)
1808{
1809 struct mptcp_subflow_context *subflow;
1810 struct mptcp_sock *msk;
1811 struct sock *sk;
6d0060f6 1812 u64 ack_seq;
f870fa0b 1813
cec37a6e 1814 subflow = mptcp_subflow_ctx(ssk);
cec37a6e
PK
1815 sk = subflow->conn;
1816 msk = mptcp_sk(sk);
1817
648ef4b8
MM
1818 pr_debug("msk=%p, token=%u", sk, subflow->token);
1819
6d0060f6
MM
1820 mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
1821 ack_seq++;
648ef4b8
MM
1822 subflow->map_seq = ack_seq;
1823 subflow->map_subflow_seq = 1;
6d0060f6 1824
cec37a6e
PK
1825 /* the socket is not connected yet, no msk/subflow ops can access/race
1826 * accessing the field below
1827 */
1828 WRITE_ONCE(msk->remote_key, subflow->remote_key);
1829 WRITE_ONCE(msk->local_key, subflow->local_key);
6d0060f6
MM
1830 WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
1831 WRITE_ONCE(msk->ack_seq, ack_seq);
d22f4988 1832 WRITE_ONCE(msk->can_ack, 1);
cc9d2566 1833 atomic64_set(&msk->snd_una, msk->write_seq);
1b1c7a0e
PK
1834
1835 mptcp_pm_new_connection(msk, 0);
a6b118fe
FW
1836
1837 mptcp_rcv_space_init(msk, ssk);
f870fa0b
MM
1838}
1839
cf7da0d6
PK
1840static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
1841{
1842 write_lock_bh(&sk->sk_callback_lock);
1843 rcu_assign_pointer(sk->sk_wq, &parent->wq);
1844 sk_set_socket(sk, parent);
1845 sk->sk_uid = SOCK_INODE(parent)->i_uid;
1846 write_unlock_bh(&sk->sk_callback_lock);
1847}
1848
f296234c
PK
1849bool mptcp_finish_join(struct sock *sk)
1850{
1851 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
1852 struct mptcp_sock *msk = mptcp_sk(subflow->conn);
1853 struct sock *parent = (void *)msk;
1854 struct socket *parent_sock;
ec3edaa7 1855 bool ret;
f296234c
PK
1856
1857 pr_debug("msk=%p, subflow=%p", msk, subflow);
1858
1859 /* mptcp socket already closing? */
b93df08c 1860 if (!mptcp_is_fully_established(parent))
f296234c
PK
1861 return false;
1862
1863 if (!msk->pm.server_side)
1864 return true;
1865
10f6d46c
PA
1866 if (!mptcp_pm_allow_new_subflow(msk))
1867 return false;
1868
1869 /* active connections are already on conn_list, and we can't acquire
1870 * msk lock here.
1871 * use the join list lock as synchronization point and double-check
1872 * msk status to avoid racing with mptcp_close()
1873 */
1874 spin_lock_bh(&msk->join_list_lock);
1875 ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
1876 if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node)))
1877 list_add_tail(&subflow->node, &msk->join_list);
1878 spin_unlock_bh(&msk->join_list_lock);
1879 if (!ret)
1880 return false;
1881
1882 /* attach to msk socket only after we are sure he will deal with us
1883 * at close time
1884 */
f296234c
PK
1885 parent_sock = READ_ONCE(parent->sk_socket);
1886 if (parent_sock && !sk->sk_socket)
1887 mptcp_sock_graft(sk, parent_sock);
10f6d46c
PA
1888 subflow->map_seq = msk->ack_seq;
1889 return true;
f296234c
PK
1890}
1891
1891c4a0
FW
1892static bool mptcp_memory_free(const struct sock *sk, int wake)
1893{
1894 struct mptcp_sock *msk = mptcp_sk(sk);
1895
1896 return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
1897}
1898
f870fa0b
MM
1899static struct proto mptcp_prot = {
1900 .name = "MPTCP",
1901 .owner = THIS_MODULE,
1902 .init = mptcp_init_sock,
18b683bf 1903 .disconnect = mptcp_disconnect,
f870fa0b 1904 .close = mptcp_close,
cf7da0d6 1905 .accept = mptcp_accept,
717e79c8
PK
1906 .setsockopt = mptcp_setsockopt,
1907 .getsockopt = mptcp_getsockopt,
f870fa0b 1908 .shutdown = tcp_shutdown,
79c0949e 1909 .destroy = mptcp_destroy,
f870fa0b
MM
1910 .sendmsg = mptcp_sendmsg,
1911 .recvmsg = mptcp_recvmsg,
14c441b5 1912 .release_cb = mptcp_release_cb,
2c5ebd00
PA
1913 .hash = mptcp_hash,
1914 .unhash = mptcp_unhash,
cec37a6e 1915 .get_port = mptcp_get_port,
d027236c
PA
1916 .sockets_allocated = &mptcp_sockets_allocated,
1917 .memory_allocated = &tcp_memory_allocated,
1918 .memory_pressure = &tcp_memory_pressure,
1891c4a0 1919 .stream_memory_free = mptcp_memory_free,
d027236c
PA
1920 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
1921 .sysctl_mem = sysctl_tcp_mem,
f870fa0b 1922 .obj_size = sizeof(struct mptcp_sock),
2c5ebd00 1923 .slab_flags = SLAB_TYPESAFE_BY_RCU,
f870fa0b
MM
1924 .no_autobind = true,
1925};
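/* mptcp_prot reuses TCP's global memory accounting (tcp_memory_allocated,
 * tcp_memory_pressure and the tcp_mem/tcp_wmem sysctls), so MPTCP data is
 * charged against the same budgets as plain TCP; SLAB_TYPESAFE_BY_RCU
 * appears to be needed by the RCU-protected token lookups behind
 * mptcp_hash()/mptcp_unhash().
 */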
1926
2303f994
PK
1927static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1928{
1929 struct mptcp_sock *msk = mptcp_sk(sock->sk);
1930 struct socket *ssock;
cf7da0d6 1931 int err;
2303f994
PK
1932
1933 lock_sock(sock->sk);
fa68018d
PA
1934 ssock = __mptcp_nmpc_socket(msk);
1935 if (!ssock) {
1936 err = -EINVAL;
2303f994
PK
1937 goto unlock;
1938 }
1939
1940 err = ssock->ops->bind(ssock, uaddr, addr_len);
cf7da0d6
PK
1941 if (!err)
1942 mptcp_copy_inaddrs(sock->sk, ssock->sk);
2303f994
PK
1943
1944unlock:
1945 release_sock(sock->sk);
1946 return err;
1947}
1948
0235d075
PA
1949static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
1950 struct mptcp_subflow_context *subflow)
1951{
1952 subflow->request_mptcp = 0;
1953 __mptcp_do_fallback(msk);
1954}
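/* With request_mptcp cleared before connect(), the initial SYN carries no
 * MP_CAPABLE option at all, and __mptcp_do_fallback() marks the msk so that
 * later socket operations are relayed to the lone TCP subflow.
 */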
1955
2303f994
PK
1956static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1957 int addr_len, int flags)
1958{
1959 struct mptcp_sock *msk = mptcp_sk(sock->sk);
2c5ebd00 1960 struct mptcp_subflow_context *subflow;
2303f994
PK
1961 struct socket *ssock;
1962 int err;
1963
1964 lock_sock(sock->sk);
41be81a8
PA
1965 if (sock->state != SS_UNCONNECTED && msk->subflow) {
1966 /* pending connection or invalid state, let existing subflow
1967 * cope with that
1968 */
1969 ssock = msk->subflow;
1970 goto do_connect;
1971 }
1972
fa68018d
PA
1973 ssock = __mptcp_nmpc_socket(msk);
1974 if (!ssock) {
1975 err = -EINVAL;
2303f994
PK
1976 goto unlock;
1977 }
1978
fa68018d
PA
1979 mptcp_token_destroy(msk);
1980 inet_sk_state_store(sock->sk, TCP_SYN_SENT);
2c5ebd00 1981 subflow = mptcp_subflow_ctx(ssock->sk);
cf7da0d6
PK
1982#ifdef CONFIG_TCP_MD5SIG
1983 /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
1984 * TCP option space.
1985 */
1986 if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
0235d075 1987 mptcp_subflow_early_fallback(msk, subflow);
cf7da0d6 1988#endif
2c5ebd00 1989 if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
0235d075 1990 mptcp_subflow_early_fallback(msk, subflow);
cf7da0d6 1991
41be81a8 1992do_connect:
2303f994 1993 err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
41be81a8
PA
1994 sock->state = ssock->state;
1995
1996 /* on successful connect, the msk state will be moved to established by
1997 * subflow_finish_connect()
1998 */
1999	if (!err || err == -EINPROGRESS)
2000 mptcp_copy_inaddrs(sock->sk, ssock->sk);
2001 else
2002 inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
2303f994
PK
2003
2004unlock:
2005 release_sock(sock->sk);
2006 return err;
2007}
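As a hedged illustration of the -EINPROGRESS case handled above, a userspace caller might exercise the same path with a non-blocking MPTCP socket. This is a sketch only, not part of this file: mptcp_connect_nonblock() is a made-up helper name, and the IPPROTO_MPTCP fallback define is an assumption for pre-MPTCP uapi headers.

/* Illustrative userspace sketch, not kernel code. */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <poll.h>
#include <errno.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262	/* assumed fallback for older uapi headers */
#endif

static int mptcp_connect_nonblock(const char *ip, unsigned short port)
{
	struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(port) };
	struct pollfd pfd;
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
	if (fd < 0)
		return -1;
	if (inet_pton(AF_INET, ip, &dst.sin_addr) != 1)
		goto fail;
	fcntl(fd, F_SETFL, O_NONBLOCK);

	/* mptcp_stream_connect() hands this to the first subflow; a
	 * non-blocking connect() normally fails with EINPROGRESS, the very
	 * case the kernel side above still treats as success when copying
	 * back the local address.
	 */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0 &&
	    errno != EINPROGRESS)
		goto fail;

	pfd.fd = fd;
	pfd.events = POLLOUT;
	poll(&pfd, 1, 5000);	/* writability signals the handshake finished */
	return fd;
fail:
	close(fd);
	return -1;
}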
2008
cf7da0d6
PK
2009static int mptcp_listen(struct socket *sock, int backlog)
2010{
2011 struct mptcp_sock *msk = mptcp_sk(sock->sk);
2012 struct socket *ssock;
2013 int err;
2014
2015 pr_debug("msk=%p", msk);
2016
2017 lock_sock(sock->sk);
fa68018d
PA
2018 ssock = __mptcp_nmpc_socket(msk);
2019 if (!ssock) {
2020 err = -EINVAL;
cf7da0d6
PK
2021 goto unlock;
2022 }
2023
fa68018d
PA
2024 mptcp_token_destroy(msk);
2025 inet_sk_state_store(sock->sk, TCP_LISTEN);
5e20087d
FW
2026 sock_set_flag(sock->sk, SOCK_RCU_FREE);
2027
cf7da0d6
PK
2028 err = ssock->ops->listen(ssock, backlog);
2029 inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
2030 if (!err)
2031 mptcp_copy_inaddrs(sock->sk, ssock->sk);
2032
2033unlock:
2034 release_sock(sock->sk);
2035 return err;
2036}
2037
cf7da0d6
PK
2038static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
2039 int flags, bool kern)
2040{
2041 struct mptcp_sock *msk = mptcp_sk(sock->sk);
2042 struct socket *ssock;
2043 int err;
2044
2045 pr_debug("msk=%p", msk);
2046
2047 lock_sock(sock->sk);
2048 if (sock->sk->sk_state != TCP_LISTEN)
2049 goto unlock_fail;
2050
2051 ssock = __mptcp_nmpc_socket(msk);
2052 if (!ssock)
2053 goto unlock_fail;
2054
8a05661b 2055 clear_bit(MPTCP_DATA_READY, &msk->flags);
cf7da0d6
PK
2056 sock_hold(ssock->sk);
2057 release_sock(sock->sk);
2058
2059 err = ssock->ops->accept(sock, newsock, flags, kern);
d2f77c53 2060 if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
cf7da0d6
PK
2061 struct mptcp_sock *msk = mptcp_sk(newsock->sk);
2062 struct mptcp_subflow_context *subflow;
2063
2064 /* set ssk->sk_socket of accept()ed flows to mptcp socket.
2065 * This is needed so NOSPACE flag can be set from tcp stack.
2066 */
ec3edaa7 2067 __mptcp_flush_join_list(msk);
cf7da0d6
PK
2068 list_for_each_entry(subflow, &msk->conn_list, node) {
2069 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
2070
2071 if (!ssk->sk_socket)
2072 mptcp_sock_graft(ssk, newsock);
2073 }
cf7da0d6
PK
2074 }
2075
8a05661b
PA
2076 if (inet_csk_listen_poll(ssock->sk))
2077 set_bit(MPTCP_DATA_READY, &msk->flags);
cf7da0d6
PK
2078 sock_put(ssock->sk);
2079 return err;
2080
2081unlock_fail:
2082 release_sock(sock->sk);
2083 return -EINVAL;
2084}
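For the accept side, a hedged userspace sketch (not part of this file; mptcp_listener() is a made-up name and the IPPROTO_MPTCP define is an assumption for older headers) shows where mptcp_listen() and mptcp_stream_accept() are entered from:

/* Illustrative userspace sketch, not kernel code. */
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262	/* assumed fallback for older uapi headers */
#endif

static int mptcp_listener(unsigned short port)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(port),
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

	if (fd < 0)
		return -1;
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 128) < 0) {		/* enters mptcp_listen() above */
		close(fd);
		return -1;
	}
	/* accept() lands in mptcp_stream_accept(); the returned socket is an
	 * MPTCP socket, or a plain TCP one when the peer never negotiated
	 * MP_CAPABLE (the mptcp_is_tcpsk() case handled above).
	 */
	return accept(fd, NULL, NULL);
}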
2085
8a05661b
PA
2086static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
2087{
2088 return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM :
2089 0;
2090}
2091
2303f994
PK
2092static __poll_t mptcp_poll(struct file *file, struct socket *sock,
2093 struct poll_table_struct *wait)
2094{
1891c4a0 2095 struct sock *sk = sock->sk;
8ab183de 2096 struct mptcp_sock *msk;
2303f994 2097 __poll_t mask = 0;
8a05661b 2098 int state;
2303f994 2099
1891c4a0 2100 msk = mptcp_sk(sk);
1891c4a0 2101 sock_poll_wait(file, sock, wait);
1891c4a0 2102
8a05661b
PA
2103 state = inet_sk_state_load(sk);
2104 if (state == TCP_LISTEN)
2105 return mptcp_check_readable(msk);
2106
2107 if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
2108 mask |= mptcp_check_readable(msk);
2109 if (sk_stream_is_writeable(sk) &&
2110 test_bit(MPTCP_SEND_SPACE, &msk->flags))
2111 mask |= EPOLLOUT | EPOLLWRNORM;
2112 }
1891c4a0
FW
2113 if (sk->sk_shutdown & RCV_SHUTDOWN)
2114 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
2115
2303f994
PK
2116 return mask;
2117}
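/* Poll semantics implemented above: a listening msk reports EPOLLIN once
 * MPTCP_DATA_READY is set by an incoming connection; an established msk
 * reports EPOLLIN while data is queued and EPOLLOUT only while
 * MPTCP_SEND_SPACE is set; a shut-down receive side adds EPOLLRDHUP.
 */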
2118
21498490
PK
2119static int mptcp_shutdown(struct socket *sock, int how)
2120{
2121 struct mptcp_sock *msk = mptcp_sk(sock->sk);
2122 struct mptcp_subflow_context *subflow;
2123 int ret = 0;
2124
2125	pr_debug("msk=%p, how=%d", msk, how);
2126
2127 lock_sock(sock->sk);
21498490
PK
2128 if (how == SHUT_WR || how == SHUT_RDWR)
2129 inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
2130
2131 how++;
2132
2133 if ((how & ~SHUTDOWN_MASK) || !how) {
2134 ret = -EINVAL;
2135 goto out_unlock;
2136 }
2137
2138 if (sock->state == SS_CONNECTING) {
2139 if ((1 << sock->sk->sk_state) &
2140 (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
2141 sock->state = SS_DISCONNECTING;
2142 else
2143 sock->state = SS_CONNECTED;
2144 }
2145
ec3edaa7 2146 __mptcp_flush_join_list(msk);
21498490
PK
2147 mptcp_for_each_subflow(msk, subflow) {
2148 struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
2149
76c42a29 2150 mptcp_subflow_shutdown(tcp_sk, how, 1, msk->write_seq);
21498490
PK
2151 }
2152
e1ff9e82
DC
2153 /* Wake up anyone sleeping in poll. */
2154 sock->sk->sk_state_change(sock->sk);
2155
21498490
PK
2156out_unlock:
2157 release_sock(sock->sk);
2158
2159 return ret;
2160}
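/* The how++ above mirrors inet_shutdown(): SHUT_RD (0) becomes RCV_SHUTDOWN
 * (1), SHUT_WR (1) becomes SEND_SHUTDOWN (2) and SHUT_RDWR (2) becomes
 * SHUTDOWN_MASK (3), so the (how & ~SHUTDOWN_MASK) check rejects any other
 * value passed in from userspace.
 */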
2161
e42f1ac6
FW
2162static const struct proto_ops mptcp_stream_ops = {
2163 .family = PF_INET,
2164 .owner = THIS_MODULE,
2165 .release = inet_release,
2166 .bind = mptcp_bind,
2167 .connect = mptcp_stream_connect,
2168 .socketpair = sock_no_socketpair,
2169 .accept = mptcp_stream_accept,
d2f77c53 2170 .getname = inet_getname,
e42f1ac6
FW
2171 .poll = mptcp_poll,
2172 .ioctl = inet_ioctl,
2173 .gettstamp = sock_gettstamp,
2174 .listen = mptcp_listen,
2175 .shutdown = mptcp_shutdown,
2176 .setsockopt = sock_common_setsockopt,
2177 .getsockopt = sock_common_getsockopt,
2178 .sendmsg = inet_sendmsg,
2179 .recvmsg = inet_recvmsg,
2180 .mmap = sock_no_mmap,
2181 .sendpage = inet_sendpage,
e42f1ac6 2182};
2303f994 2183
f870fa0b
MM
2184static struct inet_protosw mptcp_protosw = {
2185 .type = SOCK_STREAM,
2186 .protocol = IPPROTO_MPTCP,
2187 .prot = &mptcp_prot,
2303f994
PK
2188 .ops = &mptcp_stream_ops,
2189 .flags = INET_PROTOSW_ICSK,
f870fa0b
MM
2190};
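Userspace selects this protosw entry by passing IPPROTO_MPTCP to socket(). A minimal, illustrative sketch follows (not part of this file; the fallback define is an assumption for pre-MPTCP uapi headers):

/* Illustrative userspace sketch, not kernel code. */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262	/* assumed fallback for older uapi headers */
#endif

int main(void)
{
	/* SOCK_STREAM + IPPROTO_MPTCP matches mptcp_protosw above; kernels
	 * built without CONFIG_MPTCP reject this with EPROTONOSUPPORT.
	 * Fallback to plain TCP happens later, per connection, only when the
	 * peer does not answer MP_CAPABLE.
	 */
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

	if (fd < 0) {
		perror("socket(IPPROTO_MPTCP)");
		return 1;
	}
	return 0;
}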
2191
d39dceca 2192void __init mptcp_proto_init(void)
f870fa0b 2193{
2303f994 2194 mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
2303f994 2195
d027236c
PA
2196 if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
2197 panic("Failed to allocate MPTCP pcpu counter\n");
2198
2303f994 2199 mptcp_subflow_init();
1b1c7a0e 2200 mptcp_pm_init();
2c5ebd00 2201 mptcp_token_init();
2303f994 2202
f870fa0b
MM
2203 if (proto_register(&mptcp_prot, 1) != 0)
2204 panic("Failed to register MPTCP proto.\n");
2205
2206 inet_register_protosw(&mptcp_protosw);
6771bfd9
FW
2207
2208 BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
f870fa0b
MM
2209}
2210
2211#if IS_ENABLED(CONFIG_MPTCP_IPV6)
e42f1ac6
FW
2212static const struct proto_ops mptcp_v6_stream_ops = {
2213 .family = PF_INET6,
2214 .owner = THIS_MODULE,
2215 .release = inet6_release,
2216 .bind = mptcp_bind,
2217 .connect = mptcp_stream_connect,
2218 .socketpair = sock_no_socketpair,
2219 .accept = mptcp_stream_accept,
d2f77c53 2220 .getname = inet6_getname,
e42f1ac6
FW
2221 .poll = mptcp_poll,
2222 .ioctl = inet6_ioctl,
2223 .gettstamp = sock_gettstamp,
2224 .listen = mptcp_listen,
2225 .shutdown = mptcp_shutdown,
2226 .setsockopt = sock_common_setsockopt,
2227 .getsockopt = sock_common_getsockopt,
2228 .sendmsg = inet6_sendmsg,
2229 .recvmsg = inet6_recvmsg,
2230 .mmap = sock_no_mmap,
2231 .sendpage = inet_sendpage,
2232#ifdef CONFIG_COMPAT
3986912f 2233 .compat_ioctl = inet6_compat_ioctl,
e42f1ac6
FW
2234#endif
2235};
2236
f870fa0b
MM
2237static struct proto mptcp_v6_prot;
2238
79c0949e
PK
2239static void mptcp_v6_destroy(struct sock *sk)
2240{
2241 mptcp_destroy(sk);
2242 inet6_destroy_sock(sk);
2243}
2244
f870fa0b
MM
2245static struct inet_protosw mptcp_v6_protosw = {
2246 .type = SOCK_STREAM,
2247 .protocol = IPPROTO_MPTCP,
2248 .prot = &mptcp_v6_prot,
2303f994 2249 .ops = &mptcp_v6_stream_ops,
f870fa0b
MM
2250 .flags = INET_PROTOSW_ICSK,
2251};
2252
d39dceca 2253int __init mptcp_proto_v6_init(void)
f870fa0b
MM
2254{
2255 int err;
2256
2257 mptcp_v6_prot = mptcp_prot;
2258 strcpy(mptcp_v6_prot.name, "MPTCPv6");
2259 mptcp_v6_prot.slab = NULL;
79c0949e 2260 mptcp_v6_prot.destroy = mptcp_v6_destroy;
b0519de8 2261 mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);
f870fa0b
MM
2262
2263 err = proto_register(&mptcp_v6_prot, 1);
2264 if (err)
2265 return err;
2266
2267 err = inet6_register_protosw(&mptcp_v6_protosw);
2268 if (err)
2269 proto_unregister(&mptcp_v6_prot);
2270
2271 return err;
2272}
2273#endif