mptcp: avoid flipping mp_capable field in syn_recv_sock()
net/mptcp/subflow.c

// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}

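/* Lazily set up the connection identifiers a subflow needs before its
 * first packet is sent: a token for MP_CAPABLE connects, or a non-zero
 * local nonce (and a local address id) for MP_JOIN connects.
 */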
static int subflow_rebuild_header(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	int local_id, err = 0;

	if (subflow->request_mptcp && !subflow->token) {
		pr_debug("subflow=%p", sk);
		err = mptcp_token_new_connect(sk);
	} else if (subflow->request_join && !subflow->local_nonce) {
		struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn;

		pr_debug("subflow=%p", sk);

		do {
			get_random_bytes(&subflow->local_nonce, sizeof(u32));
		} while (!subflow->local_nonce);

		if (subflow->local_id)
			goto out;

		local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
		if (local_id < 0)
			return -EINVAL;

		subflow->local_id = local_id;
	}

out:
	if (err)
		return err;

	return subflow->icsk_af_ops->rebuild_header(sk);
}

static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->mp_capable)
		mptcp_token_destroy_request(subflow_req->token);
	tcp_request_sock_ops.destructor(req);
}

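/* HMAC the two exchanged nonces with both endpoint keys; MP_JOIN
 * carries a truncated (64-bit) version of this value in the SYN-ACK.
 */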
static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}

/* validate received token and create truncated hmac and nonce for SYN-ACK */
static bool subflow_token_join_request(struct request_sock *req,
				       const struct sk_buff *skb)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	u8 hmac[MPTCPOPT_HMAC_LEN];
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return false;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return false;
	}
	subflow_req->local_id = local_id;

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);

	sock_put((struct sock *)msk);
	return true;
}

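/* Parse the MPTCP options on the incoming SYN and record the requested
 * handshake (MP_CAPABLE or MP_JOIN, never both) in the request socket.
 */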
static void subflow_init_req(struct request_sock *req,
			     const struct sock *sk_listener,
			     struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct tcp_options_received rx_opt;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

	memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp));
	mptcp_get_options(skb, &rx_opt);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->remote_key_valid = 0;

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return;
#endif

	if (rx_opt.mptcp.mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (rx_opt.mptcp.mp_join)
			return;
	} else if (rx_opt.mptcp.mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
	}

	if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
		int err;

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
	} else if (rx_opt.mptcp.mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = rx_opt.mptcp.backup;
		subflow_req->remote_id = rx_opt.mptcp.join_id;
		subflow_req->token = rx_opt.mptcp.token;
		subflow_req->remote_nonce = rx_opt.mptcp.nonce;
		pr_debug("token=%u, remote_nonce=%u", subflow_req->token,
			 subflow_req->remote_nonce);
		if (!subflow_token_join_request(req, skb)) {
			subflow_req->mp_join = 0;
			/* @@ need to trigger RST */
		}
	}
}

static void subflow_v4_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static void subflow_v6_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}
#endif

/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[MPTCPOPT_HMAC_LEN];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)thmac,
		 (unsigned long long)subflow->thmac);

	return thmac == subflow->thmac;
}

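/* Complete the MPTCP handshake on an active-open subflow once the TCP
 * connection is established: MP_CAPABLE connects are finalized directly,
 * MP_JOIN ones only after the peer's truncated HMAC validates; on
 * failure the subflow is reset.
 */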
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (inet_sk_state_load(parent) != TCP_ESTABLISHED) {
		inet_sk_state_store(parent, TCP_ESTABLISHED);
		parent->sk_state_change(parent);
	}

	if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp)
		return;

	if (subflow->mp_capable) {
		pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
			 subflow->remote_key);
		mptcp_finish_connect(sk);
		subflow->conn_finished = 1;

		if (skb) {
			pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
		}
	} else if (subflow->mp_join) {
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
			 subflow, subflow->thmac,
			 subflow->remote_nonce);
		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			subflow->mp_join = 0;
			goto do_reset;
		}

		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      subflow->hmac);

		if (skb)
			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;

		if (!mptcp_finish_join(sk))
			goto do_reset;

		subflow->conn_finished = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
	} else {
do_reset:
		tcp_send_active_reset(sk, GFP_ATOMIC);
		tcp_done(sk);
	}
}

static struct request_sock_ops subflow_request_sock_ops;
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct tcp_options_received *rx_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[MPTCPOPT_HMAC_LEN];
	struct mptcp_sock *msk;
	bool ret;

	subflow_req = mptcp_subflow_rsk(req);
	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

	ret = true;
	if (crypto_memneq(hmac, rx_opt->mptcp.hmac, sizeof(hmac)))
		ret = false;

	sock_put((struct sock *)msk);
	return ret;
}

static void mptcp_sock_destruct(struct sock *sk)
{
	/* if new mptcp socket isn't accepted, it is free'd
	 * from the tcp listener sockets request queue, linked
	 * from req->sk. The tcp socket is released.
	 * This calls the ULP release function which will
	 * also remove the mptcp socket, via
	 * sock_put(ctx->conn).
	 *
	 * Problem is that the mptcp socket will not be in
	 * SYN_RECV state and doesn't have SOCK_DEAD flag.
	 * Both result in warnings from inet_sock_destruct.
	 */

	if (sk->sk_state == TCP_SYN_RECV) {
		sk->sk_state = TCP_CLOSE;
		WARN_ON_ONCE(sk->sk_socket);
		sock_orphan(sk);
	}

	inet_sock_destruct(sk);
}

static void mptcp_force_close(struct sock *sk)
{
	inet_sk_state_store(sk, TCP_CLOSE);
	sk_common_release(sk);
}

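/* Detach the MPTCP ULP state from a subflow so that it continues its
 * life as a plain TCP socket.
 */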
static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;
}

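/* Build the child socket for a passive open. Note that fallback
 * decisions taken after option parsing are tracked in the local
 * 'fallback'/'fallback_is_fatal' flags instead of flipping the
 * request socket's mp_capable field.
 */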
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct tcp_options_received opt_rx;
	bool fallback_is_fatal = false;
	struct sock *new_msk = NULL;
	bool fallback = false;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	if (tcp_rsk(req)->is_mptcp == 0)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	subflow_req = mptcp_subflow_rsk(req);
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		opt_rx.mptcp.mp_capable = 0;
		mptcp_get_options(skb, &opt_rx);
		if (opt_rx.mptcp.mp_capable) {
			subflow_req->remote_key = opt_rx.mptcp.sndr_key;
			subflow_req->remote_key_valid = 1;
		} else {
			fallback = true;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, req);
		if (!new_msk)
			fallback = true;
	} else if (subflow_req->mp_join) {
		fallback_is_fatal = true;
		opt_rx.mptcp.mp_join = 0;
		mptcp_get_options(skb, &opt_rx);
		if (!opt_rx.mptcp.mp_join ||
		    !subflow_hmac_valid(req, &opt_rx)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			return NULL;
		}
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		/* we need to fallback on ctx allocation failure and on pre-reqs
		 * checking above. In the latter scenario we additionally need
		 * to reset the context to non MPTCP status.
		 */
		if (!ctx || fallback) {
			if (fallback_is_fatal)
				goto close_child;

			if (ctx) {
				subflow_ulp_fallback(child, ctx);
				kfree_rcu(ctx, rcu);
			}
			goto out;
		}

		if (ctx->mp_capable) {
			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			new_msk->sk_destruct = mptcp_sock_destruct;
			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
			ctx->conn = new_msk;
			new_msk = NULL;
		} else if (ctx->mp_join) {
			struct mptcp_sock *owner;

			owner = mptcp_token_get_sock(ctx->token);
			if (!owner)
				goto close_child;

			ctx->conn = (struct sock *)owner;
			if (!mptcp_finish_join(child))
				goto close_child;

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
		}
	}

out:
	/* dispose of the left over mptcp master, if any */
	if (unlikely(new_msk))
		mptcp_force_close(new_msk);

	/* check for expected invariant - should never trigger, just help
	 * catching earlier subtle bugs
	 */
	WARN_ON_ONCE(*own_req && child && tcp_sk(child)->is_mptcp &&
		     (!mptcp_subflow_ctx(child) ||
		      !mptcp_subflow_ctx(child)->conn));
	return child;

close_child:
	tcp_send_active_reset(child, GFP_ATOMIC);
	inet_csk_prepare_forced_close(child);
	tcp_done(child);
	return NULL;
}

static struct inet_connection_sock_af_ops subflow_specific;

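/* Outcome of parsing the DSS mapping for the skb at the head of the
 * receive queue: usable mapping, protocol error, no data queued, or a
 * DATA_FIN without payload.
 */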
enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN
};

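/* Expand the 32-bit data sequence number carried by a DSS option to
 * 64 bits, using the previous mapping to infer the upper half.
 */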
static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

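/* Return true if the not yet consumed part of the skb fits entirely
 * inside the current mapping.
 */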
static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
				  subflow->map_data_len))) {
		/* Mapping only covers past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}

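/* Fetch and validate the DSS mapping carried by the skb at the head of
 * the receive queue, if any, against the mapping currently in use.
 */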
static enum mapping_status get_mapping_status(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0-len FIN pkts to the receive
			 * queue; those are the only 0-len pkts ever expected
			 * here, and we can admit no mapping only for 0-len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			pr_debug("DATA_FIN with no payload");
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			} else {
				return MAPPING_DATA_FIN;
			}
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
	}

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}

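/* tcp_read_sock() actor that merely accounts for the flushed bytes;
 * used below to drop data already covered at the MPTCP level.
 */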
static int subflow_read_actor(read_descriptor_t *desc,
			      struct sk_buff *skb,
			      unsigned int offset, size_t len)
{
	size_t copy_len = min(desc->count, len);

	desc->count -= copy_len;

	pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
	return copy_len;
}

static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u32 map_remaining;
		size_t delta;
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack)
			break;

		/* only accept in-sequence mappings. Old values are spurious
		 * retransmissions; we can hit "future" values on active backup
		 * subflow switch, and we rely on retransmissions to get
		 * in-sequence data.
		 * Concurrent subflows support will require subflow data
		 * reordering
		 */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);
		if (before64(ack_seq, old_ack))
			delta = min_t(size_t, old_ack - ack_seq, map_remaining);
		else
			delta = min_t(size_t, ack_seq - old_ack, map_remaining);

		/* discard mapped data */
		pr_debug("discarding %zu bytes, current map len=%d", delta,
			 map_remaining);
		if (delta) {
			read_descriptor_t desc = {
				.count = delta,
			};
			int ret;

			ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
			if (ret < 0) {
				ssk->sk_err = -ret;
				goto fatal;
			}
			if (ret < delta)
				return false;
			if (delta == map_remaining)
				subflow->map_valid = 0;
		}
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	return false;
}

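/* Return true when new MPTCP-level data is ready to be read from this
 * subflow, discarding the current mapping first if it has been fully
 * consumed.
 */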
bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sk_buff *skb;

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	if (!subflow_check_data_avail(sk)) {
		subflow->data_avail = 0;
		return false;
	}

	skb = skb_peek(&sk->sk_receive_queue);
	subflow->data_avail = skb &&
		before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
	return subflow->data_avail;
}

static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	if (!subflow->mp_capable && !subflow->mp_join) {
		subflow->tcp_data_ready(sk);

		parent->sk_data_ready(parent);
		return;
	}

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}

static void subflow_write_space(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	sk_stream_write_space(sk);
	if (sk_stream_is_writeable(sk)) {
		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
		smp_mb__after_atomic();
		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
		sk_stream_write_space(parent);
	}
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif

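/* Convert a path manager mptcp_addr_info into a sockaddr usable with
 * kernel_bind()/kernel_connect().
 */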
static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
				struct sockaddr_storage *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = info->family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		in_addr->sin_addr = info->addr;
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}

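/* Create an additional subflow towards the given remote address and
 * start the MP_JOIN handshake on it; the subflow is added to the msk
 * join_list while the join is in progress.
 */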
int __mptcp_subflow_connect(struct sock *sk, int ifindex,
			    const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	struct socket *sf;
	u32 remote_token;
	int addrlen;
	int err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	err = mptcp_subflow_create_socket(sk, &sf);
	if (err)
		return err;

	subflow = mptcp_subflow_ctx(sf->sk);
	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (loc->family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	sf->sk->sk_bound_dev_if = ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u", msk, remote_token);
	subflow->remote_token = remote_token;
	subflow->local_id = loc->id;
	subflow->request_join = 1;
	subflow->request_bkup = 1;
	mptcp_info2sockaddr(remote, &addr);

	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed;

	spin_lock_bh(&msk->join_list_lock);
	list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);

	return err;

failed:
	sock_release(sf);
	return err;
}

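/* Create a kernel TCP socket, attach the "mptcp" ULP and link the
 * resulting subflow context back to the owning MPTCP socket.
 */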
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err)
		return err;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;

	return 0;
}

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (subflow->mp_capable && mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		mptcp_subflow_eof(parent);
	}
}

static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
out:
	return err;
}

static void subflow_ulp_release(struct sock *sk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);

	if (!ctx)
		return;

	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

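/* Propagate the MPTCP state collected on the request socket into the
 * context of the freshly cloned child, falling back to plain TCP when
 * the request carries no MPTCP state at all.
 */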
static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->fully_established = subflow_req->remote_key_valid;
		new_ctx->can_ack = subflow_req->remote_key_valid;
		new_ctx->remote_key = subflow_req->remote_key;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->local_id = subflow_req->local_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;
	}
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};

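/* Set up a dedicated request_sock slab for subflows: the MPTCP request
 * sock embeds the TCP one but is larger, so the stock TCP slab cannot
 * be reused.
 */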
static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}

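/* Hook the MPTCP-specific request sock ops and af_ops overrides into
 * the TCP stack and register the "mptcp" ULP; called at boot.
 */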
void mptcp_subflow_init(void)
{
	subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_specific.rebuild_header = subflow_rebuild_header;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_v6_specific.rebuild_header = subflow_rebuild_header;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}