mptcp: allow dumping subflow context to userspace
[linux-2.6-block.git] / net/mptcp/subflow.c
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"

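/* MPTCP hooks the af_ops rebuild_header callback so that it runs on the
 * connect path: allocate the connection token for MP_CAPABLE subflows,
 * or the local nonce and address id for MP_JOIN subflows, before
 * delegating to the original TCP callback.
 */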
static int subflow_rebuild_header(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	int local_id, err = 0;

	if (subflow->request_mptcp && !subflow->token) {
		pr_debug("subflow=%p", sk);
		err = mptcp_token_new_connect(sk);
	} else if (subflow->request_join && !subflow->local_nonce) {
		struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn;

		pr_debug("subflow=%p", sk);

		do {
			get_random_bytes(&subflow->local_nonce, sizeof(u32));
		} while (!subflow->local_nonce);

		if (subflow->local_id)
			goto out;

		local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
		if (local_id < 0)
			return -EINVAL;

		subflow->local_id = local_id;
	}

out:
	if (err)
		return err;

	return subflow->icsk_af_ops->rebuild_header(sk);
}

static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->mp_capable)
		mptcp_token_destroy_request(subflow_req->token);
	tcp_request_sock_ops.destructor(req);
}

static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}

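/* Note on the key/nonce ordering used throughout the handshake: each
 * message is authenticated with the sender's key and nonce first, so
 * the listener builds the SYN-ACK thmac from (local_key, remote_key,
 * local_nonce, remote_nonce) while the initiator verifies it with the
 * arguments swapped; compare subflow_token_join_request(),
 * subflow_thmac_valid() and subflow_hmac_valid().
 */
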
/* validate received token and create truncated hmac and nonce for SYN-ACK */
static bool subflow_token_join_request(struct request_sock *req,
				       const struct sk_buff *skb)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	u8 hmac[MPTCPOPT_HMAC_LEN];
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk) {
		pr_debug("subflow_req=%p, token=%u - not found\n",
			 subflow_req, subflow_req->token);
		return false;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return false;
	}
	subflow_req->local_id = local_id;

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);

	sock_put((struct sock *)msk);
	return true;
}

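/* Parse the MPTCP options carried by the incoming SYN and set up the
 * request socket: MP_CAPABLE requests get a freshly allocated token,
 * MP_JOIN requests are validated against the token of an established
 * MPTCP connection via subflow_token_join_request().
 */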
static void subflow_init_req(struct request_sock *req,
			     const struct sock *sk_listener,
			     struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct tcp_options_received rx_opt;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

	memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp));
	mptcp_get_options(skb, &rx_opt);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->remote_key_valid = 0;

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return;
#endif

	if (rx_opt.mptcp.mp_capable && rx_opt.mptcp.mp_join)
		return;

	if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
		int err;

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
	} else if (rx_opt.mptcp.mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = rx_opt.mptcp.backup;
		subflow_req->remote_id = rx_opt.mptcp.join_id;
		subflow_req->token = rx_opt.mptcp.token;
		subflow_req->remote_nonce = rx_opt.mptcp.nonce;
		pr_debug("token=%u, remote_nonce=%u", subflow_req->token,
			 subflow_req->remote_nonce);
		if (!subflow_token_join_request(req, skb)) {
			subflow_req->mp_join = 0;
			/* @@ need to trigger RST */
		}
	}
}

static void subflow_v4_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static void subflow_v6_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}
#endif

/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[MPTCPOPT_HMAC_LEN];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)thmac,
		 (unsigned long long)subflow->thmac);

	return thmac == subflow->thmac;
}

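/* Client-side handshake completion: finalize an MP_CAPABLE connection,
 * or validate the thmac echoed in the SYN-ACK and compute the hmac for
 * the third ACK of an MP_JOIN. Any validation failure resets the
 * subflow.
 */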
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (inet_sk_state_load(parent) != TCP_ESTABLISHED) {
		inet_sk_state_store(parent, TCP_ESTABLISHED);
		parent->sk_state_change(parent);
	}

	if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp)
		return;

	if (subflow->mp_capable) {
		pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
			 subflow->remote_key);
		mptcp_finish_connect(sk);
		subflow->conn_finished = 1;

		if (skb) {
			pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
		}
	} else if (subflow->mp_join) {
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
			 subflow, subflow->thmac,
			 subflow->remote_nonce);
		if (!subflow_thmac_valid(subflow)) {
			subflow->mp_join = 0;
			goto do_reset;
		}

		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      subflow->hmac);

		if (skb)
			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;

		if (!mptcp_finish_join(sk))
			goto do_reset;

		subflow->conn_finished = 1;
	} else {
do_reset:
		tcp_send_active_reset(sk, GFP_ATOMIC);
		tcp_done(sk);
	}
}

static struct request_sock_ops subflow_request_sock_ops;
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct tcp_options_received *rx_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[MPTCPOPT_HMAC_LEN];
	struct mptcp_sock *msk;
	bool ret;

	subflow_req = mptcp_subflow_rsk(req);
	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

	ret = true;
	if (crypto_memneq(hmac, rx_opt->mptcp.hmac, sizeof(hmac)))
		ret = false;

	sock_put((struct sock *)msk);
	return ret;
}

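/* Server-side counterpart of subflow_finish_connect(): build the child
 * socket for an incoming connection request. For MP_CAPABLE requests a
 * new mptcp master socket is cloned up front; for MP_JOIN the hmac in
 * the third ACK is checked and the child is attached to the owning msk.
 */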
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct tcp_options_received opt_rx;
	bool fallback_is_fatal = false;
	struct sock *new_msk = NULL;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	if (tcp_rsk(req)->is_mptcp == 0)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	subflow_req = mptcp_subflow_rsk(req);
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		opt_rx.mptcp.mp_capable = 0;
		mptcp_get_options(skb, &opt_rx);
		if (opt_rx.mptcp.mp_capable) {
			subflow_req->remote_key = opt_rx.mptcp.sndr_key;
			subflow_req->remote_key_valid = 1;
		} else {
			subflow_req->mp_capable = 0;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, req);
		if (!new_msk)
			subflow_req->mp_capable = 0;
	} else if (subflow_req->mp_join) {
		fallback_is_fatal = true;
		opt_rx.mptcp.mp_join = 0;
		mptcp_get_options(skb, &opt_rx);
		if (!opt_rx.mptcp.mp_join ||
		    !subflow_hmac_valid(req, &opt_rx))
			return NULL;
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		/* we have null ctx on TCP fallback, which is fatal on
		 * MPJ handshake
		 */
		if (!ctx) {
			if (fallback_is_fatal)
				goto close_child;
			goto out;
		}

		if (ctx->mp_capable) {
			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			inet_sk_state_store(new_msk, TCP_ESTABLISHED);
			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
			ctx->conn = new_msk;
			new_msk = NULL;
		} else if (ctx->mp_join) {
			struct mptcp_sock *owner;

			owner = mptcp_token_get_sock(ctx->token);
			if (!owner)
				goto close_child;

			ctx->conn = (struct sock *)owner;
			if (!mptcp_finish_join(child))
				goto close_child;
		}
	}

out:
	/* dispose of the left over mptcp master, if any */
	if (unlikely(new_msk))
		sock_put(new_msk);
	return child;

close_child:
	tcp_send_active_reset(child, GFP_ATOMIC);
	inet_csk_prepare_forced_close(child);
	tcp_done(child);
	return NULL;
}

static struct inet_connection_sock_af_ops subflow_specific;

enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN
};

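/* expand_seq() below widens a 32-bit data sequence number to 64 bits,
 * taking the high word from the end of the previous mapping. A worked
 * example: with old_seq = 0x00000001fffffff0 and old_data_len = 0x20,
 * the 32-bit seq 0x00000005 expands to 0x0000000200000005.
 */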
static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
			     subflow->map_data_len))) {
		/* Mapping only covers past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}

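/* Inspect the MPTCP extension of the skb at the head of the receive
 * queue and update the current DSS mapping. Returns MAPPING_EMPTY when
 * there is nothing to process, MAPPING_INVALID on a protocol violation
 * (the caller then resets the subflow) and MAPPING_DATA_FIN for a
 * DATA_FIN carrying no new data.
 */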
static enum mapping_status get_mapping_status(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0 len FIN pkts to the receive
			 * queue, those are the only 0len pkts ever expected
			 * here, and we can admit no mapping only for 0 len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			pr_debug("DATA_FIN with no payload");
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			} else {
				return MAPPING_DATA_FIN;
			}
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
	}

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb))
			return MAPPING_INVALID;

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}

static int subflow_read_actor(read_descriptor_t *desc,
			      struct sk_buff *skb,
			      unsigned int offset, size_t len)
{
	size_t copy_len = min(desc->count, len);

	desc->count -= copy_len;

	pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
	return copy_len;
}

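/* Check whether in-sequence data is ready on this subflow. Data that is
 * out of sequence at the MPTCP level is discarded: old data is a
 * spurious retransmission, while "future" data will reach us again via
 * an in-sequence retransmission. A fatal mapping error resets the
 * subflow.
 */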
static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u32 map_remaining;
		size_t delta;
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack)
			break;

		/* only accept in-sequence mappings. Old values are spurious
		 * retransmissions; we can hit "future" values on active backup
		 * subflow switch, and we rely on retransmissions to get
		 * in-sequence data.
		 * Concurrent subflows support will require subflow data
		 * reordering
		 */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);
		if (before64(ack_seq, old_ack))
			delta = min_t(size_t, old_ack - ack_seq, map_remaining);
		else
			delta = min_t(size_t, ack_seq - old_ack, map_remaining);

		/* discard mapped data */
		pr_debug("discarding %zu bytes, current map len=%d", delta,
			 map_remaining);
		if (delta) {
			read_descriptor_t desc = {
				.count = delta,
			};
			int ret;

			ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
			if (ret < 0) {
				ssk->sk_err = -ret;
				goto fatal;
			}
			if (ret < delta)
				return false;
			if (delta == map_remaining)
				subflow->map_valid = 0;
		}
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	return false;
}

bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sk_buff *skb;

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	if (!subflow_check_data_avail(sk)) {
		subflow->data_avail = 0;
		return false;
	}

	skb = skb_peek(&sk->sk_receive_queue);
	subflow->data_avail = skb &&
			      before(tcp_sk(sk)->copied_seq,
				     TCP_SKB_CB(skb)->end_seq);
	return subflow->data_avail;
}

static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	if (!subflow->mp_capable && !subflow->mp_join) {
		subflow->tcp_data_ready(sk);

		parent->sk_data_ready(parent);
		return;
	}

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}

static void subflow_write_space(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	sk_stream_write_space(sk);
	if (sk_stream_is_writeable(sk)) {
		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
		smp_mb__after_atomic();
		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
		sk_stream_write_space(parent);
	}
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif

static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
				struct sockaddr_storage *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = info->family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		in_addr->sin_addr = info->addr;
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}

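/* Create a new TCP subflow and connect it to @remote, bound to @loc and
 * @ifindex. request_join (and, in this version, request_bkup) is set
 * before connecting, so the SYN carries MP_JOIN; on success the subflow
 * is appended to the msk join list.
 */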
int __mptcp_subflow_connect(struct sock *sk, int ifindex,
			    const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	struct socket *sf;
	u32 remote_token;
	int addrlen;
	int err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	err = mptcp_subflow_create_socket(sk, &sf);
	if (err)
		return err;

	subflow = mptcp_subflow_ctx(sf->sk);
	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (loc->family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	sf->sk->sk_bound_dev_if = ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u", msk, remote_token);
	subflow->remote_token = remote_token;
	subflow->local_id = loc->id;
	subflow->request_join = 1;
	subflow->request_bkup = 1;
	mptcp_info2sockaddr(remote, &addr);

	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed;

	spin_lock_bh(&msk->join_list_lock);
	list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);

	return err;

failed:
	sock_release(sf);
	return err;
}

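/* Create a kernel TCP socket and attach the "mptcp" ULP to it, so the
 * subflow carries an mptcp_subflow_context from the start. The context
 * holds a reference on the parent mptcp socket via subflow->conn.
 */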
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err)
		return err;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;

	return 0;
}

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (subflow->mp_capable && mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		parent->sk_shutdown |= RCV_SHUTDOWN;
		__subflow_state_change(parent);
	}
}

static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
out:
	return err;
}

static void subflow_ulp_release(struct sock *sk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);

	if (!ctx)
		return;

	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;
}

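/* Attach a context to a subflow socket cloned from a request socket:
 * either initialize a fresh context from the request, or fall back to
 * plain TCP when the handshake did not complete as MPTCP.
 */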
static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->fully_established = subflow_req->remote_key_valid;
		new_ctx->can_ack = subflow_req->remote_key_valid;
		new_ctx->remote_key = subflow_req->remote_key;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->local_id = subflow_req->local_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;
	}
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};

static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}

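/* mptcp_diag_subflow_init() below hooks the subflow ULP ops into the
 * diag interface; this is what allows dumping the subflow context to
 * userspace, per the subject of this commit.
 */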
void mptcp_subflow_init(void)
{
	subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_specific.rebuild_header = subflow_rebuild_header;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_v6_specific.rebuild_header = subflow_rebuild_header;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}