mptcp: cleanup mptcp_subflow_discard_data()
net/mptcp/subflow.c

// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <crypto/sha.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}

static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->msk)
		sock_put((struct sock *)subflow_req->msk);

	mptcp_token_destroy_request(req);
	tcp_request_sock_ops.destructor(req);
}

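/* HMAC over the two concatenated 32-bit nonces, keyed with both peers' keys;
 * this builds both the truncated SYN-ACK HMAC and the full third-ACK HMAC of
 * the MP_JOIN handshake, with key/nonce order swapped per direction
 */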
static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}

static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
{
	return mptcp_is_fully_established((void *)msk) &&
	       READ_ONCE(msk->pm.accept_subflow);
}

/* validate received token and create truncated hmac and nonce for SYN-ACK */
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
						     const struct sk_buff *skb)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return NULL;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return NULL;
	}
	subflow_req->local_id = local_id;

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

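	/* the SYN-ACK option carries only the first 64 bits of the HMAC */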
	subflow_req->thmac = get_unaligned_be64(hmac);
	return msk;
}

static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->msk = NULL;
	mptcp_token_init_request(req);

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return -EINVAL;
#endif

	return 0;
}

static void subflow_init_req(struct request_sock *req,
			     const struct sock *sk_listener,
			     struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	int ret;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

	ret = __subflow_init_req(req, sk_listener);
	if (ret)
		return;

	mptcp_get_options(skb, &mp_opt);

	if (mp_opt.mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (mp_opt.mp_join)
			return;
	} else if (mp_opt.mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
	}

	if (mp_opt.mp_capable && listener->request_mptcp) {
		int err, retries = 4;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
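		/* the token derived from the local key must be unique; on
		 * collision with an existing connection, retry a few times
		 * with a fresh key before giving up on MPTCP
		 */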
again:
		do {
			get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
		} while (subflow_req->local_key == 0);

		if (unlikely(req->syncookie)) {
			mptcp_crypto_key_sha(subflow_req->local_key,
					     &subflow_req->token,
					     &subflow_req->idsn);
			if (mptcp_token_exists(subflow_req->token)) {
				if (retries-- > 0)
					goto again;
			} else {
				subflow_req->mp_capable = 1;
			}
			return;
		}

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;
		else if (retries-- > 0)
			goto again;

	} else if (mp_opt.mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = mp_opt.backup;
		subflow_req->remote_id = mp_opt.join_id;
		subflow_req->token = mp_opt.token;
		subflow_req->remote_nonce = mp_opt.nonce;
		subflow_req->msk = subflow_token_join_request(req, skb);

		if (unlikely(req->syncookie) && subflow_req->msk) {
			if (mptcp_can_accept_new_subflow(subflow_req->msk))
				subflow_init_req_cookie_join_save(subflow_req, skb);
		}

		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
			 subflow_req->remote_nonce, subflow_req->msk);
	}
}

int mptcp_subflow_init_cookie_req(struct request_sock *req,
				  const struct sock *sk_listener,
				  struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	int err;

	err = __subflow_init_req(req, sk_listener);
	if (err)
		return err;

	mptcp_get_options(skb, &mp_opt);

	if (mp_opt.mp_capable && mp_opt.mp_join)
		return -EINVAL;

	if (mp_opt.mp_capable && listener->request_mptcp) {
		if (mp_opt.sndr_key == 0)
			return -EINVAL;

		subflow_req->local_key = mp_opt.rcvr_key;
		err = mptcp_token_new_request(req);
		if (err)
			return err;

		subflow_req->mp_capable = 1;
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	} else if (mp_opt.mp_join && listener->request_mptcp) {
		if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
			return -EINVAL;

		if (mptcp_can_accept_new_subflow(subflow_req->msk))
			subflow_req->mp_join = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);

static void subflow_v4_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static void subflow_v6_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}
#endif

/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)thmac,
		 (unsigned long long)subflow->thmac);

	return thmac == subflow->thmac;
}

static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_options_received mp_opt;
	struct sock *parent = subflow->conn;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
		inet_sk_state_store(parent, TCP_ESTABLISHED);
		parent->sk_state_change(parent);
	}

	/* be sure no special action on any packet other than syn-ack */
	if (subflow->conn_finished)
		return;

	subflow->rel_write_seq = 1;
	subflow->conn_finished = 1;
	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);

	mptcp_get_options(skb, &mp_opt);
	if (subflow->request_mptcp) {
		if (!mp_opt.mp_capable) {
			MPTCP_INC_STATS(sock_net(sk),
					MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
			mptcp_do_fallback(sk);
			pr_fallback(mptcp_sk(subflow->conn));
			goto fallback;
		}

		subflow->mp_capable = 1;
		subflow->can_ack = 1;
		subflow->remote_key = mp_opt.sndr_key;
		pr_debug("subflow=%p, remote_key=%llu", subflow,
			 subflow->remote_key);
		mptcp_finish_connect(sk);
	} else if (subflow->request_join) {
		u8 hmac[SHA256_DIGEST_SIZE];

		if (!mp_opt.mp_join)
			goto do_reset;

		subflow->thmac = mp_opt.thmac;
		subflow->remote_nonce = mp_opt.nonce;
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
			 subflow->thmac, subflow->remote_nonce);

		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			goto do_reset;
		}

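		/* compute the full HMAC to be sent with the third ACK */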
		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      hmac);
		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

		if (!mptcp_finish_join(sk))
			goto do_reset;

		subflow->mp_join = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
	} else if (mptcp_check_fallback(sk)) {
fallback:
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
	}
	return;

do_reset:
	tcp_send_active_reset(sk, GFP_ATOMIC);
	tcp_done(sk);
}

struct request_sock_ops mptcp_subflow_request_sock_ops;
EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops);
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct mptcp_options_received *mp_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;

	subflow_req = mptcp_subflow_rsk(req);
	msk = subflow_req->msk;
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

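	/* crypto_memneq() compares in constant time, avoiding a timing
	 * side channel on the HMAC check
	 */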
	return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}

static void mptcp_sock_destruct(struct sock *sk)
{
	/* if new mptcp socket isn't accepted, it is free'd
	 * from the tcp listener sockets request queue, linked
	 * from req->sk. The tcp socket is released.
	 * This calls the ULP release function which will
	 * also remove the mptcp socket, via
	 * sock_put(ctx->conn).
	 *
	 * Problem is that the mptcp socket will be in
	 * ESTABLISHED state and will not have the SOCK_DEAD flag.
	 * Both result in warnings from inet_sock_destruct.
	 */

	if (sk->sk_state == TCP_ESTABLISHED) {
		sk->sk_state = TCP_CLOSE;
		WARN_ON_ONCE(sk->sk_socket);
		sock_orphan(sk);
	}

	skb_rbtree_purge(&mptcp_sk(sk)->out_of_order_queue);
	mptcp_token_destroy(mptcp_sk(sk));
	inet_sock_destruct(sk);
}

static void mptcp_force_close(struct sock *sk)
{
	inet_sk_state_store(sk, TCP_CLOSE);
	sk_common_release(sk);
}

static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;
}

static void subflow_drop_ctx(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);

	if (!ctx)
		return;

	subflow_ulp_fallback(ssk, ctx);
	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
				     struct mptcp_options_received *mp_opt)
{
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	subflow->remote_key = mp_opt->sndr_key;
	subflow->fully_established = 1;
	subflow->can_ack = 1;
	WRITE_ONCE(msk->fully_established, true);
}

static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct mptcp_options_received mp_opt;
	bool fallback, fallback_is_fatal;
	struct sock *new_msk = NULL;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	/* After child creation we must look for 'mp_capable' even when options
	 * are not parsed
	 */
	mp_opt.mp_capable = 0;

	/* hopefully temporary handling for MP_JOIN+syncookie */
	subflow_req = mptcp_subflow_rsk(req);
	fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
	fallback = !tcp_rsk(req)->is_mptcp;
	if (fallback)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_capable) {
			fallback = true;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
		if (!new_msk)
			fallback = true;
	} else if (subflow_req->mp_join) {
		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_join ||
		    !mptcp_can_accept_new_subflow(subflow_req->msk) ||
		    !subflow_hmac_valid(req, &mp_opt)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			fallback = true;
		}
	}

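	/* for MP_JOIN subflows fallback is fatal: the child gets reset below */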
create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		tcp_rsk(req)->drop_req = false;

		/* we need to fallback on ctx allocation failure and on pre-reqs
		 * checking above. In the latter scenario we additionally need
		 * to reset the context to non MPTCP status.
		 */
		if (!ctx || fallback) {
			if (fallback_is_fatal)
				goto dispose_child;

			subflow_drop_ctx(child);
			goto out;
		}

		if (ctx->mp_capable) {
			/* this can't race with mptcp_close(), as the msk is
			 * not yet exposed to user-space
			 */
			inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);

			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			new_msk->sk_destruct = mptcp_sock_destruct;
			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
			mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
			ctx->conn = new_msk;
			new_msk = NULL;

			/* with OoO packets we can reach here without ingress
			 * mpc option
			 */
			if (mp_opt.mp_capable)
				mptcp_subflow_fully_established(ctx, &mp_opt);
		} else if (ctx->mp_join) {
			struct mptcp_sock *owner;

			owner = subflow_req->msk;
			if (!owner)
				goto dispose_child;

			/* move the msk reference ownership to the subflow */
			subflow_req->msk = NULL;
			ctx->conn = (struct sock *)owner;
			if (!mptcp_finish_join(child))
				goto dispose_child;

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
			tcp_rsk(req)->drop_req = true;
		}
	}

out:
	/* dispose of the left over mptcp master, if any */
	if (unlikely(new_msk))
		mptcp_force_close(new_msk);

	/* check for expected invariant - should never trigger, just help
	 * catching earlier subtle bugs
	 */
	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
		     (!mptcp_subflow_ctx(child) ||
		      !mptcp_subflow_ctx(child)->conn));
	return child;

dispose_child:
	subflow_drop_ctx(child);
	tcp_rsk(req)->drop_req = true;
	inet_csk_prepare_for_destroy_sock(child);
	tcp_done(child);
	req->rsk_ops->send_reset(sk, skb);

	/* The last child reference will be released by the caller */
	return child;
}

static struct inet_connection_sock_af_ops subflow_specific;

enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN,
	MAPPING_DUMMY
};

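/* reconstruct a full 64-bit DSN from a 32-bit one carried on the wire, using
 * the end of the previous mapping as reference for the upper bits
 */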
static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
				  subflow->map_data_len))) {
		/* Mapping only covers past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}

static enum mapping_status get_mapping_status(struct sock *ssk,
					      struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	if (mptcp_check_fallback(ssk))
		return MAPPING_DUMMY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0-len FIN pkts to the
			 * receive queue, those are the only 0-len pkts ever
			 * expected here, and we can admit no mapping only
			 * for 0-len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			mptcp_update_rcv_data_fin(msk, mpext->data_seq);
			pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			} else {
				return MAPPING_DATA_FIN;
			}
		} else {
			mptcp_update_rcv_data_fin(msk, mpext->data_seq + data_len);
			pr_debug("DATA_FIN with mapping seq=%llu", mpext->data_seq + data_len);
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		subflow->use_64bit_ack = 0;
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
		subflow->use_64bit_ack = 1;
	}

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}

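/* advance copied_seq by up to @limit bytes (plus the FIN, if any), dropping
 * the skb once fully consumed and invalidating the current mapping once it
 * has been fully covered
 */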
static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
				       unsigned int limit)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
	u32 incr;

	incr = limit >= skb->len ? skb->len + fin : limit;

	pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
		 subflow->map_subflow_seq);
	tcp_sk(ssk)->copied_seq += incr;
	if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
		sk_eat_skb(ssk, skb);
	if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
		subflow->map_valid = 0;
}

static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (!skb_peek(&ssk->sk_receive_queue))
		subflow->data_avail = 0;
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk, msk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}
		if (status == MAPPING_DUMMY) {
			__mptcp_do_fallback(msk);
			skb = skb_peek(&ssk->sk_receive_queue);
			subflow->map_valid = 1;
			subflow->map_seq = READ_ONCE(msk->ack_seq);
			subflow->map_data_len = skb->len;
			subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
						   subflow->ssn_offset;
			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
			return true;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

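		/* compare the mapping DSN with the msk-level ack_seq: equal
		 * means in-sequence data, later means out-of-order data,
		 * earlier means an already-acked mapping whose data can be
		 * discarded
		 */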
		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack) {
			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
			break;
		} else if (after64(ack_seq, old_ack)) {
			subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
			break;
		}

		/* only accept in-sequence mappings; old values are spurious
		 * retransmissions
		 */
		mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	subflow->data_avail = 0;
	return false;
}

bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	return subflow_check_data_avail(sk);
}

/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
 * not the ssk one.
 *
 * In mptcp, rwin is about the mptcp-level connection data.
 *
 * Data that is still on the ssk rx queue can thus be ignored,
 * as far as the mptcp peer is concerned that data is still inflight.
 * DSS ACK is updated when skb is moved to the mptcp rx queue.
 */
void mptcp_space(const struct sock *ssk, int *space, int *full_space)
{
	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	const struct sock *sk = subflow->conn;

	*space = tcp_space(sk);
	*full_space = tcp_full_space(sk);
}

static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	u16 state = 1 << inet_sk_state_load(sk);
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	msk = mptcp_sk(parent);
	if (state & TCPF_LISTEN) {
		set_bit(MPTCP_DATA_READY, &msk->flags);
		parent->sk_data_ready(parent);
		return;
	}

	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
		     !subflow->mp_join && !(state & TCPF_CLOSE));

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}

static void subflow_write_space(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	if (!sk_stream_is_writeable(sk))
		return;

	if (sk_stream_is_writeable(parent)) {
		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
		smp_mb__after_atomic();
		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
		sk_stream_write_space(parent);
	}
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif

static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
				struct sockaddr_storage *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = info->family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		in_addr->sin_addr = info->addr;
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}

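/* create an in-kernel TCP socket, bind it to the given local address and
 * connect it to the remote one, requesting an MP_JOIN into the connection
 * owned by @sk via the msk token
 */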
int __mptcp_subflow_connect(struct sock *sk, int ifindex,
			    const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	int local_id = loc->id;
	struct socket *sf;
	struct sock *ssk;
	u32 remote_token;
	int addrlen;
	int err;

	if (!mptcp_is_fully_established(sk))
		return -ENOTCONN;

	err = mptcp_subflow_create_socket(sk, &sf);
	if (err)
		return err;

	ssk = sf->sk;
	subflow = mptcp_subflow_ctx(ssk);
	do {
		get_random_bytes(&subflow->local_nonce, sizeof(u32));
	} while (!subflow->local_nonce);

	if (!local_id) {
		err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
		if (err < 0)
			goto failed;

		local_id = err;
	}

	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (loc->family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	ssk->sk_bound_dev_if = ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u local_id=%d", msk, remote_token,
		 local_id);
	subflow->remote_token = remote_token;
	subflow->local_id = local_id;
	subflow->request_join = 1;
	subflow->request_bkup = 1;
	mptcp_info2sockaddr(remote, &addr);

	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed;

	spin_lock_bh(&msk->join_list_lock);
	list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);

	return err;

failed:
	sock_release(sf);
	return err;
}

int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	/* un-accepted server sockets can reach here - on bad configuration
	 * bail early to avoid greater trouble later
	 */
	if (unlikely(!sk->sk_socket))
		return -EINVAL;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err) {
		sock_release(sf);
		return err;
	}

	/* the newly created socket really belongs to the owning MPTCP master
	 * socket, even if for additional subflows the allocation is performed
	 * by a kernel workqueue. Adjust inode references, so that the
	 * procfs/diag interfaces really show this one belonging to the correct
	 * user.
	 */
	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;

	return 0;
}

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	if (subflow_simultaneous_connect(sk)) {
		mptcp_do_fallback(sk);
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
		pr_fallback(mptcp_sk(parent));
		subflow->conn_finished = 1;
		if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
			inet_sk_state_store(parent, TCP_ESTABLISHED);
			parent->sk_state_change(parent);
		}
	}

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (__mptcp_check_fallback(mptcp_sk(parent)) &&
	    !(parent->sk_shutdown & RCV_SHUTDOWN) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		mptcp_subflow_eof(parent);
	}
}

static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
out:
	return err;
}

static void subflow_ulp_release(struct sock *sk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);

	if (!ctx)
		return;

	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->local_id = subflow_req->local_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;
	}
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name = "mptcp",
	.owner = THIS_MODULE,
	.init = subflow_ulp_init,
	.release = subflow_ulp_release,
	.clone = subflow_ulp_clone,
};

static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}

void __init mptcp_subflow_init(void)
{
	mptcp_subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}