tcp: Don't allocate tcp_death_row outside of struct netns_ipv4.
net/ipv4/tcp_ipv4.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

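/* Per-cpu control socket used to transmit RSTs and ACKs on behalf of packets
 * that have no (or no full) socket of their own; see tcp_v4_send_reset() and
 * tcp_v4_send_ack(), which temporarily bind it to the right netns around use.
 */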
static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);

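/* The initial sequence number and the timestamp offset of a new connection
 * are both derived from the 4-tuple (plus the netns for the ts offset) via
 * the secure_seq helpers, making them hard to predict for off-path attackers.
 */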
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	__be32 daddr, nexthop, prev_sk_rcv_saddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

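	/* If no source address was bound yet, remember the current bhash2
	 * bucket (if any) so inet_bhash2_update_saddr() below can rehash the
	 * socket under its new saddr; on failure the old saddr/rcv_saddr are
	 * restored and the route is dropped.
	 */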
	if (!inet->inet_saddr) {
		if (inet_csk(sk)->icsk_bind2_hash) {
			prev_addr_hashbucket = inet_bhashfn_portaddr(&tcp_hashinfo,
								     sk, net, inet->inet_num);
			prev_sk_rcv_saddr = sk->sk_rcv_saddr;
		}
		inet->inet_saddr = fl4->saddr;
	}

	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (prev_addr_hashbucket) {
		err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
		if (err) {
			inet->inet_saddr = 0;
			sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
			ip_rt_put(rt);
			return err;
		}
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
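	/* tcp_death_row is embedded in struct netns_ipv4 (per the commit this
	 * view belongs to), so it is simply taken from the socket's netns.
	 */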
	tcp_death_row = &net->ipv4.tcp_death_row;
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

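/* Prepare an outgoing skb for checksum offload: seed th->check with the
 * pseudo-header checksum and point csum_start/csum_offset at the TCP
 * checksum field so hardware (or software checksum completion) can finish it.
 */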
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#else
#define OPTION_BYTES sizeof(__be32)
#endif

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

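/* A segment that matched a TIME-WAIT socket is answered with an ACK built
 * from the saved timewait state (snd_nxt, rcv_nxt, scaled window, timestamps).
 */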
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
				inet_sk(sk)->tos;

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_gso_disable(sk);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

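/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler: the _EXT form may also
 * carry an address prefix length and an interface index, which must refer
 * to an L3 master (VRF) device and is translated into the key's l3index.
 */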
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}

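/* Feed the IPv4 pseudo-header plus the TCP header (with its checksum field
 * zeroed) into the MD5 transform, as required for RFC 2385 signatures.
 */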
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs send to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_gso_disable(newsk);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

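/* A non-SYN segment arriving on a LISTEN socket may be the ACK completing a
 * SYN-cookie handshake; cookie_v4_check() validates the cookie and, if it is
 * good, rebuilds the request and returns the resulting child socket.
 */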
079096f1 1623static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1da177e4 1624{
079096f1 1625#ifdef CONFIG_SYN_COOKIES
52452c54 1626 const struct tcphdr *th = tcp_hdr(skb);
1da177e4 1627
af9b4738 1628 if (!th->syn)
461b74c3 1629 sk = cookie_v4_check(sk, skb);
1da177e4
LT
1630#endif
1631 return sk;
1632}
1633
9349d600
PP
1634u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1635 struct tcphdr *th, u32 *cookie)
1636{
1637 u16 mss = 0;
1638#ifdef CONFIG_SYN_COOKIES
1639 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1640 &tcp_request_sock_ipv4_ops, sk, th);
1641 if (mss) {
1642 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1643 tcp_synq_overflow(sk);
1644 }
1645#endif
1646 return mss;
1647}
1648
bbd807df
BV
1649INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1650 u32));
1da177e4 1651/* The socket must have its spinlock held when we get
e994b2f0 1652 * here, unless it is a TCP_LISTEN socket.
1da177e4
LT
1653 *
1654 * We have a potential double-lock case here, so even when
1655 * doing backlog processing we use the BH locking scheme.
1656 * This is because we cannot sleep with the original spinlock
1657 * held.
1658 */
1659int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1660{
8eba65fa 1661 enum skb_drop_reason reason;
cfb6eeb4 1662 struct sock *rsk;
cfb6eeb4 1663
1da177e4 1664 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
8f905c0e
ED
1665 struct dst_entry *dst;
1666
1667 dst = rcu_dereference_protected(sk->sk_rx_dst,
1668 lockdep_sock_is_held(sk));
404e0a8b 1669
bdeab991 1670 sock_rps_save_rxhash(sk, skb);
3d97379a 1671 sk_mark_napi_id(sk, skb);
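 /* Validate the cached input route before reusing it: if the ingress
 * device changed or the route has been invalidated, drop it here and
 * let a later lookup install a fresh one.
 */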
404e0a8b 1672 if (dst) {
0c0a5ef8 1673 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
bbd807df
BV
1674 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1675 dst, 0)) {
8f905c0e 1676 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
92101b3b 1677 dst_release(dst);
92101b3b
DM
1678 }
1679 }
3d97d88e 1680 tcp_rcv_established(sk, skb);
1da177e4
LT
1681 return 0;
1682 }
1683
8eba65fa 1684 reason = SKB_DROP_REASON_NOT_SPECIFIED;
12e25e10 1685 if (tcp_checksum_complete(skb))
1da177e4
LT
1686 goto csum_err;
1687
1688 if (sk->sk_state == TCP_LISTEN) {
079096f1
ED
1689 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1690
1da177e4
LT
1691 if (!nsk)
1692 goto discard;
1da177e4 1693 if (nsk != sk) {
cfb6eeb4
YH
1694 if (tcp_child_process(sk, nsk, skb)) {
1695 rsk = nsk;
1da177e4 1696 goto reset;
cfb6eeb4 1697 }
1da177e4
LT
1698 return 0;
1699 }
ca55158c 1700 } else
bdeab991 1701 sock_rps_save_rxhash(sk, skb);
ca55158c 1702
72ab4a86 1703 if (tcp_rcv_state_process(sk, skb)) {
cfb6eeb4 1704 rsk = sk;
1da177e4 1705 goto reset;
cfb6eeb4 1706 }
1da177e4
LT
1707 return 0;
1708
1709reset:
cfb6eeb4 1710 tcp_v4_send_reset(rsk, skb);
1da177e4 1711discard:
8eba65fa 1712 kfree_skb_reason(skb, reason);
1da177e4
LT
1713 /* Be careful here. If this function gets more complicated and
1714 * gcc suffers from register pressure on the x86, sk (in %ebx)
1715 * might be destroyed here. This current version compiles correctly,
1716 * but you have been warned.
1717 */
1718 return 0;
1719
1720csum_err:
8eba65fa 1721 reason = SKB_DROP_REASON_TCP_CSUM;
709c0314 1722 trace_tcp_bad_csum(skb);
c10d9310
ED
1723 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1724 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1da177e4
LT
1725 goto discard;
1726}
4bc2f18b 1727EXPORT_SYMBOL(tcp_v4_do_rcv);
1da177e4 1728
7487449c 1729int tcp_v4_early_demux(struct sk_buff *skb)
41063e9d 1730{
41063e9d
DM
1731 const struct iphdr *iph;
1732 const struct tcphdr *th;
1733 struct sock *sk;
41063e9d 1734
41063e9d 1735 if (skb->pkt_type != PACKET_HOST)
7487449c 1736 return 0;
41063e9d 1737
45f00f99 1738 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
7487449c 1739 return 0;
41063e9d
DM
1740
1741 iph = ip_hdr(skb);
45f00f99 1742 th = tcp_hdr(skb);
41063e9d
DM
1743
1744 if (th->doff < sizeof(struct tcphdr) / 4)
7487449c 1745 return 0;
41063e9d 1746
45f00f99 1747 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
41063e9d 1748 iph->saddr, th->source,
7011d085 1749 iph->daddr, ntohs(th->dest),
3fa6f616 1750 skb->skb_iif, inet_sdif(skb));
41063e9d
DM
1751 if (sk) {
1752 skb->sk = sk;
1753 skb->destructor = sock_edemux;
f7e4eb03 1754 if (sk_fullsock(sk)) {
8f905c0e 1755 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
505fbcf0 1756
41063e9d
DM
1757 if (dst)
1758 dst = dst_check(dst, 0);
92101b3b 1759 if (dst &&
0c0a5ef8 1760 sk->sk_rx_dst_ifindex == skb->skb_iif)
92101b3b 1761 skb_dst_set_noref(skb, dst);
41063e9d
DM
1762 }
1763 }
7487449c 1764 return 0;
41063e9d
DM
1765}
1766
7a26dc9e
MD
1767bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1768 enum skb_drop_reason *reason)
c9c33212 1769{
d519f350 1770 u32 limit, tail_gso_size, tail_gso_segs;
4f693b55
ED
1771 struct skb_shared_info *shinfo;
1772 const struct tcphdr *th;
1773 struct tcphdr *thtail;
1774 struct sk_buff *tail;
1775 unsigned int hdrlen;
1776 bool fragstolen;
1777 u32 gso_segs;
b160c285 1778 u32 gso_size;
4f693b55 1779 int delta;
c9c33212
ED
1780
1781 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1782 * we can fix skb->truesize to its real value to avoid future drops.
1783 * This is valid because skb is not yet charged to the socket.
1784 * It has been noticed that pure SACK packets were sometimes dropped
1785 * (if cooked by drivers without the copybreak feature).
1786 */
60b1af33 1787 skb_condense(skb);
c9c33212 1788
ade9628e
ED
1789 skb_dst_drop(skb);
1790
4f693b55
ED
1791 if (unlikely(tcp_checksum_complete(skb))) {
1792 bh_unlock_sock(sk);
709c0314 1793 trace_tcp_bad_csum(skb);
7a26dc9e 1794 *reason = SKB_DROP_REASON_TCP_CSUM;
4f693b55
ED
1795 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1796 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1797 return true;
1798 }
1799
1800 /* Attempt coalescing to last skb in backlog, even if we are
1801 * above the limits.
1802 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1803 */
1804 th = (const struct tcphdr *)skb->data;
1805 hdrlen = th->doff * 4;
4f693b55
ED
1806
1807 tail = sk->sk_backlog.tail;
1808 if (!tail)
1809 goto no_coalesce;
1810 thtail = (struct tcphdr *)tail->data;
1811
1812 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1813 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1814 ((TCP_SKB_CB(tail)->tcp_flags |
ca2fe295
ED
1815 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1816 !((TCP_SKB_CB(tail)->tcp_flags &
1817 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
4f693b55
ED
1818 ((TCP_SKB_CB(tail)->tcp_flags ^
1819 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1820#ifdef CONFIG_TLS_DEVICE
1821 tail->decrypted != skb->decrypted ||
1822#endif
1823 thtail->doff != th->doff ||
1824 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1825 goto no_coalesce;
1826
1827 __skb_pull(skb, hdrlen);
b160c285
ED
1828
1829 shinfo = skb_shinfo(skb);
1830 gso_size = shinfo->gso_size ?: skb->len;
1831 gso_segs = shinfo->gso_segs ?: 1;
1832
1833 shinfo = skb_shinfo(tail);
1834 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1835 tail_gso_segs = shinfo->gso_segs ?: 1;
1836
4f693b55 1837 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
4f693b55
ED
1838 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1839
86bccd03 1840 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
4f693b55 1841 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
86bccd03
ED
1842 thtail->window = th->window;
1843 }
4f693b55 1844
ca2fe295
ED
1845 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1846 * thtail->fin, so that the fast path in tcp_rcv_established()
1847 * is not entered if we append a packet with a FIN.
1848 * SYN, RST, URG are not present.
1849 * ACK is set on both packets.
1850 * PSH : we do not really care in TCP stack,
1851 * at least for 'GRO' packets.
1852 */
1853 thtail->fin |= th->fin;
4f693b55
ED
1854 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1855
1856 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1857 TCP_SKB_CB(tail)->has_rxtstamp = true;
1858 tail->tstamp = skb->tstamp;
1859 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1860 }
1861
1862 /* Not as strict as GRO. We only need to carry the max mss value */
b160c285
ED
1863 shinfo->gso_size = max(gso_size, tail_gso_size);
1864 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
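 /* The 0xFFFF cap above matches the 16-bit gso_segs field in
 * struct skb_shared_info.
 */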
4f693b55
ED
1865
1866 sk->sk_backlog.len += delta;
1867 __NET_INC_STATS(sock_net(sk),
1868 LINUX_MIB_TCPBACKLOGCOALESCE);
1869 kfree_skb_partial(skb, fragstolen);
1870 return false;
1871 }
1872 __skb_push(skb, hdrlen);
1873
1874no_coalesce:
1875 /* Only the socket owner can try to collapse/prune rx queues
1876 * to reduce memory overhead, so add a little headroom here.
1877 * Only a few socket backlogs are likely to be non-empty concurrently.
1878 */
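 /* Illustrative example (values are not defaults): with sk_rcvbuf and
 * sk_sndbuf both at 256 KiB, the limit below works out to roughly
 * 256K + 256K + 64K = 576 KiB of backlogged data before drops start.
 */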
d519f350 1879 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
4f693b55 1880
c9c33212
ED
1881 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1882 bh_unlock_sock(sk);
7a26dc9e 1883 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
c9c33212
ED
1884 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1885 return true;
1886 }
1887 return false;
1888}
1889EXPORT_SYMBOL(tcp_add_backlog);
1890
ac6e7800
ED
1891int tcp_filter(struct sock *sk, struct sk_buff *skb)
1892{
1893 struct tcphdr *th = (struct tcphdr *)skb->data;
ac6e7800 1894
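 /* Run the attached socket filter (if any); sk_filter_trim_cap() will not
 * trim the skb below the TCP header length, so the header re-parsing in
 * tcp_v4_rcv() after this call remains valid.
 */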
f2feaefd 1895 return sk_filter_trim_cap(sk, skb, th->doff * 4);
ac6e7800
ED
1896}
1897EXPORT_SYMBOL(tcp_filter);
1898
eeea10b8
ED
1899static void tcp_v4_restore_cb(struct sk_buff *skb)
1900{
1901 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1902 sizeof(struct inet_skb_parm));
1903}
1904
1905static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1906 const struct tcphdr *th)
1907{
1908 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1909 * barrier() makes sure the compiler won't play fool^Waliasing games.
1910 */
1911 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1912 sizeof(struct inet_skb_parm));
1913 barrier();
1914
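 /* seq/end_seq bracket the segment in sequence space: SYN and FIN each
 * consume one sequence number in addition to the payload bytes.
 */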
1915 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1916 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1917 skb->len - th->doff * 4);
1918 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1919 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1920 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1921 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1922 TCP_SKB_CB(skb)->sacked = 0;
1923 TCP_SKB_CB(skb)->has_rxtstamp =
1924 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1925}
1926
1da177e4
LT
1927/*
1928 * From tcp_input.c
1929 */
1930
1931int tcp_v4_rcv(struct sk_buff *skb)
1932{
3b24d854 1933 struct net *net = dev_net(skb->dev);
643b622b 1934 enum skb_drop_reason drop_reason;
3fa6f616 1935 int sdif = inet_sdif(skb);
534322ca 1936 int dif = inet_iif(skb);
eddc9ec5 1937 const struct iphdr *iph;
cf533ea5 1938 const struct tcphdr *th;
3b24d854 1939 bool refcounted;
1da177e4
LT
1940 struct sock *sk;
1941 int ret;
1942
85125597 1943 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1da177e4
LT
1944 if (skb->pkt_type != PACKET_HOST)
1945 goto discard_it;
1946
1947 /* Count it even if it's bad */
90bbcc60 1948 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1da177e4
LT
1949
1950 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1951 goto discard_it;
1952
ea1627c2 1953 th = (const struct tcphdr *)skb->data;
1da177e4 1954
85125597
MD
1955 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1956 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1da177e4 1957 goto bad_packet;
85125597 1958 }
1da177e4
LT
1959 if (!pskb_may_pull(skb, th->doff * 4))
1960 goto discard_it;
1961
1962 /* An explanation is required here, I think.
1963 * Packet length and doff are validated by header prediction,
caa20d9a 1964 * provided the case of th->doff==0 is eliminated.
1da177e4 1965 * So, we defer the checks. */
ed70fcfc
TH
1966
1967 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1968 goto csum_error;
1da177e4 1969
ea1627c2 1970 th = (const struct tcphdr *)skb->data;
eddc9ec5 1971 iph = ip_hdr(skb);
4bdc3d66 1972lookup:
a583636a 1973 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
3fa6f616 1974 th->dest, sdif, &refcounted);
1da177e4
LT
1975 if (!sk)
1976 goto no_tcp_socket;
1977
bb134d5d
ED
1978process:
1979 if (sk->sk_state == TCP_TIME_WAIT)
1980 goto do_time_wait;
1981
079096f1
ED
1982 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1983 struct request_sock *req = inet_reqsk(sk);
e0f9759f 1984 bool req_stolen = false;
7716682c 1985 struct sock *nsk;
079096f1
ED
1986
1987 sk = req->rsk_listener;
6f0012e3
ED
1988 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1989 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1990 else
1991 drop_reason = tcp_inbound_md5_hash(sk, skb,
1330b6ef
JK
1992 &iph->saddr, &iph->daddr,
1993 AF_INET, dif, sdif);
1994 if (unlikely(drop_reason)) {
e65c332d 1995 sk_drops_add(sk, skb);
72923555
ED
1996 reqsk_put(req);
1997 goto discard_it;
1998 }
4fd44a98
FL
1999 if (tcp_checksum_complete(skb)) {
2000 reqsk_put(req);
2001 goto csum_error;
2002 }
7716682c 2003 if (unlikely(sk->sk_state != TCP_LISTEN)) {
d4f2c86b
KI
2004 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2005 if (!nsk) {
2006 inet_csk_reqsk_queue_drop_and_put(sk, req);
2007 goto lookup;
2008 }
2009 sk = nsk;
2010 /* reuseport_migrate_sock() has already held one sk_refcnt
2011 * before returning.
2012 */
2013 } else {
2014 /* We own a reference on the listener, increase it again
2015 * as we might lose it too soon.
2016 */
2017 sock_hold(sk);
4bdc3d66 2018 }
3b24d854 2019 refcounted = true;
1f3b359f 2020 nsk = NULL;
eeea10b8
ED
2021 if (!tcp_filter(sk, skb)) {
2022 th = (const struct tcphdr *)skb->data;
2023 iph = ip_hdr(skb);
2024 tcp_v4_fill_cb(skb, iph, th);
e0f9759f 2025 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
255f9034
MD
2026 } else {
2027 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
eeea10b8 2028 }
079096f1
ED
2029 if (!nsk) {
2030 reqsk_put(req);
e0f9759f
ED
2031 if (req_stolen) {
2032 /* Another cpu got exclusive access to req
2033 * and created a full blown socket.
2034 * Try to feed this packet to this socket
2035 * instead of discarding it.
2036 */
2037 tcp_v4_restore_cb(skb);
2038 sock_put(sk);
2039 goto lookup;
2040 }
7716682c 2041 goto discard_and_relse;
079096f1 2042 }
6f0012e3 2043 nf_reset_ct(skb);
079096f1 2044 if (nsk == sk) {
079096f1 2045 reqsk_put(req);
eeea10b8 2046 tcp_v4_restore_cb(skb);
079096f1
ED
2047 } else if (tcp_child_process(sk, nsk, skb)) {
2048 tcp_v4_send_reset(nsk, skb);
7716682c 2049 goto discard_and_relse;
079096f1 2050 } else {
7716682c 2051 sock_put(sk);
079096f1
ED
2052 return 0;
2053 }
2054 }
14834c4f 2055
020e71a3
ED
2056 if (static_branch_unlikely(&ip4_min_ttl)) {
2057 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2058 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2059 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2060 goto discard_and_relse;
2061 }
6cce09f8 2062 }
d218d111 2063
255f9034
MD
2064 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2065 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1da177e4 2066 goto discard_and_relse;
255f9034 2067 }
9ea88a15 2068
1330b6ef
JK
2069 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2070 &iph->daddr, AF_INET, dif, sdif);
2071 if (drop_reason)
9ea88a15 2072 goto discard_and_relse;
9ea88a15 2073
895b5c9f 2074 nf_reset_ct(skb);
1da177e4 2075
85125597 2076 if (tcp_filter(sk, skb)) {
364df53c 2077 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
1da177e4 2078 goto discard_and_relse;
85125597 2079 }
ac6e7800
ED
2080 th = (const struct tcphdr *)skb->data;
2081 iph = ip_hdr(skb);
eeea10b8 2082 tcp_v4_fill_cb(skb, iph, th);
1da177e4
LT
2083
2084 skb->dev = NULL;
2085
e994b2f0
ED
2086 if (sk->sk_state == TCP_LISTEN) {
2087 ret = tcp_v4_do_rcv(sk, skb);
2088 goto put_and_return;
2089 }
2090
2091 sk_incoming_cpu_update(sk);
2092
c6366184 2093 bh_lock_sock_nested(sk);
a44d6eac 2094 tcp_segs_in(tcp_sk(sk), skb);
1da177e4
LT
2095 ret = 0;
2096 if (!sock_owned_by_user(sk)) {
e7942d06 2097 ret = tcp_v4_do_rcv(sk, skb);
8b27dae5 2098 } else {
7a26dc9e 2099 if (tcp_add_backlog(sk, skb, &drop_reason))
8b27dae5 2100 goto discard_and_relse;
6b03a53a 2101 }
1da177e4
LT
2102 bh_unlock_sock(sk);
2103
e994b2f0 2104put_and_return:
3b24d854
ED
2105 if (refcounted)
2106 sock_put(sk);
1da177e4
LT
2107
2108 return ret;
2109
2110no_tcp_socket:
85125597 2111 drop_reason = SKB_DROP_REASON_NO_SOCKET;
1da177e4
LT
2112 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2113 goto discard_it;
2114
eeea10b8
ED
2115 tcp_v4_fill_cb(skb, iph, th);
2116
12e25e10 2117 if (tcp_checksum_complete(skb)) {
6a5dc9e5 2118csum_error:
85125597 2119 drop_reason = SKB_DROP_REASON_TCP_CSUM;
709c0314 2120 trace_tcp_bad_csum(skb);
90bbcc60 2121 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1da177e4 2122bad_packet:
90bbcc60 2123 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1da177e4 2124 } else {
cfb6eeb4 2125 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
2126 }
2127
2128discard_it:
f8319dfd 2129 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
1da177e4 2130 /* Discard frame. */
85125597 2131 kfree_skb_reason(skb, drop_reason);
e905a9ed 2132 return 0;
1da177e4
LT
2133
2134discard_and_relse:
532182cd 2135 sk_drops_add(sk, skb);
3b24d854
ED
2136 if (refcounted)
2137 sock_put(sk);
1da177e4
LT
2138 goto discard_it;
2139
2140do_time_wait:
2141 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
255f9034 2142 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
9469c7b4 2143 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
2144 goto discard_it;
2145 }
2146
eeea10b8
ED
2147 tcp_v4_fill_cb(skb, iph, th);
2148
6a5dc9e5
ED
2149 if (tcp_checksum_complete(skb)) {
2150 inet_twsk_put(inet_twsk(sk));
2151 goto csum_error;
1da177e4 2152 }
9469c7b4 2153 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 2154 case TCP_TW_SYN: {
c346dca1 2155 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
a583636a
CG
2156 &tcp_hashinfo, skb,
2157 __tcp_hdrlen(th),
da5e3630 2158 iph->saddr, th->source,
eddc9ec5 2159 iph->daddr, th->dest,
3fa6f616
DA
2160 inet_iif(skb),
2161 sdif);
1da177e4 2162 if (sk2) {
dbe7faa4 2163 inet_twsk_deschedule_put(inet_twsk(sk));
1da177e4 2164 sk = sk2;
eeea10b8 2165 tcp_v4_restore_cb(skb);
3b24d854 2166 refcounted = false;
1da177e4
LT
2167 goto process;
2168 }
1da177e4 2169 }
fcfd6dfa 2170 /* to ACK */
a8eceea8 2171 fallthrough;
1da177e4
LT
2172 case TCP_TW_ACK:
2173 tcp_v4_timewait_ack(sk, skb);
2174 break;
2175 case TCP_TW_RST:
271c3b9b
FW
2176 tcp_v4_send_reset(sk, skb);
2177 inet_twsk_deschedule_put(inet_twsk(sk));
2178 goto discard_it;
1da177e4
LT
2179 case TCP_TW_SUCCESS:;
2180 }
2181 goto discard_it;
2182}
2183
ccb7c410
DM
2184static struct timewait_sock_ops tcp_timewait_sock_ops = {
2185 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2186 .twsk_unique = tcp_twsk_unique,
2187 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 2188};
1da177e4 2189
63d02d15 2190void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
2191{
2192 struct dst_entry *dst = skb_dst(skb);
2193
5037e9ef 2194 if (dst && dst_hold_safe(dst)) {
8f905c0e 2195 rcu_assign_pointer(sk->sk_rx_dst, dst);
0c0a5ef8 2196 sk->sk_rx_dst_ifindex = skb->skb_iif;
ca777eff 2197 }
5d299f3d 2198}
63d02d15 2199EXPORT_SYMBOL(inet_sk_rx_dst_set);
5d299f3d 2200
3b401a81 2201const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
2202 .queue_xmit = ip_queue_xmit,
2203 .send_check = tcp_v4_send_check,
2204 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 2205 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
2206 .conn_request = tcp_v4_conn_request,
2207 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
2208 .net_header_len = sizeof(struct iphdr),
2209 .setsockopt = ip_setsockopt,
2210 .getsockopt = ip_getsockopt,
2211 .addr2sockaddr = inet_csk_addr2sockaddr,
2212 .sockaddr_len = sizeof(struct sockaddr_in),
4fab9071 2213 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 2214};
4bc2f18b 2215EXPORT_SYMBOL(ipv4_specific);
1da177e4 2216
cfb6eeb4 2217#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 2218static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 2219 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 2220 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 2221 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 2222};
b6332e6c 2223#endif
cfb6eeb4 2224
1da177e4
LT
2225/* NOTE: A lot of things set to zero explicitly by call to
2226 * sk_alloc() so need not be done here.
2227 */
2228static int tcp_v4_init_sock(struct sock *sk)
2229{
6687e988 2230 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 2231
900f65d3 2232 tcp_init_sock(sk);
1da177e4 2233
8292a17a 2234 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 2235
cfb6eeb4 2236#ifdef CONFIG_TCP_MD5SIG
ac807fa8 2237 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 2238#endif
1da177e4 2239
1da177e4
LT
2240 return 0;
2241}
2242
7d06b2e0 2243void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
2244{
2245 struct tcp_sock *tp = tcp_sk(sk);
2246
e1a4aa50
SL
2247 trace_tcp_destroy_sock(sk);
2248
1da177e4
LT
2249 tcp_clear_xmit_timers(sk);
2250
6687e988 2251 tcp_cleanup_congestion_control(sk);
317a76f9 2252
734942cc
DW
2253 tcp_cleanup_ulp(sk);
2254
1da177e4 2255 /* Cleanup up the write buffer. */
fe067e8a 2256 tcp_write_queue_purge(sk);
1da177e4 2257
cf1ef3f0
WW
2258 /* Check if we want to disable active TFO */
2259 tcp_fastopen_active_disable_ofo_check(sk);
2260
1da177e4 2261 /* Cleans up our, hopefully empty, out_of_order_queue. */
9f5afeae 2262 skb_rbtree_purge(&tp->out_of_order_queue);
1da177e4 2263
cfb6eeb4
YH
2264#ifdef CONFIG_TCP_MD5SIG
2265 /* Clean up the MD5 key list, if any */
2266 if (tp->md5sig_info) {
a915da9b 2267 tcp_clear_md5_list(sk);
fb7df5e4 2268 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
cfb6eeb4
YH
2269 tp->md5sig_info = NULL;
2270 }
2271#endif
1a2449a8 2272
1da177e4 2273 /* Clean up a referenced TCP bind bucket. */
463c84b9 2274 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 2275 inet_put_port(sk);
1da177e4 2276
d983ea6f 2277 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
435cf559 2278
cf60af03
YC
2279 /* If socket is aborted during connect operation */
2280 tcp_free_fastopen_req(tp);
1fba70e5 2281 tcp_fastopen_destroy_cipher(sk);
cd8ae852 2282 tcp_saved_syn_free(tp);
cf60af03 2283
180d8cd9 2284 sk_sockets_allocated_dec(sk);
1da177e4 2285}
1da177e4
LT
2286EXPORT_SYMBOL(tcp_v4_destroy_sock);
2287
2288#ifdef CONFIG_PROC_FS
2289/* Proc filesystem TCP sock list dumping. */
2290
ad2d6137
MKL
2291static unsigned short seq_file_family(const struct seq_file *seq);
2292
2293static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2294{
2295 unsigned short family = seq_file_family(seq);
2296
2297 /* AF_UNSPEC is used as a match all */
2298 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2299 net_eq(sock_net(sk), seq_file_net(seq)));
2300}
2301
b72acf45
MKL
2302/* Find a non-empty bucket (starting from st->bucket)
2303 * and return the first sk from it.
a8b690f9 2304 */
b72acf45 2305static void *listening_get_first(struct seq_file *seq)
1da177e4 2306{
5799de0b 2307 struct tcp_iter_state *st = seq->private;
1da177e4 2308
b72acf45 2309 st->offset = 0;
05c0b357
MKL
2310 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2311 struct inet_listen_hashbucket *ilb2;
cae3873c 2312 struct hlist_nulls_node *node;
b72acf45 2313 struct sock *sk;
b08d4d3b 2314
05c0b357 2315 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
cae3873c 2316 if (hlist_nulls_empty(&ilb2->nulls_head))
b72acf45
MKL
2317 continue;
2318
05c0b357 2319 spin_lock(&ilb2->lock);
cae3873c 2320 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
b72acf45
MKL
2321 if (seq_sk_match(seq, sk))
2322 return sk;
2323 }
05c0b357 2324 spin_unlock(&ilb2->lock);
1da177e4 2325 }
b72acf45
MKL
2326
2327 return NULL;
2328}
2329
2330/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2331 * If "cur" is the last one in the st->bucket,
2332 * call listening_get_first() to return the first sk of the next
2333 * non empty bucket.
a8b690f9 2334 */
1da177e4
LT
2335static void *listening_get_next(struct seq_file *seq, void *cur)
2336{
5799de0b 2337 struct tcp_iter_state *st = seq->private;
05c0b357 2338 struct inet_listen_hashbucket *ilb2;
cae3873c 2339 struct hlist_nulls_node *node;
3b24d854 2340 struct sock *sk = cur;
1da177e4 2341
1da177e4 2342 ++st->num;
a8b690f9 2343 ++st->offset;
1da177e4 2344
cae3873c
MKL
2345 sk = sk_nulls_next(sk);
2346 sk_nulls_for_each_from(sk, node) {
ad2d6137 2347 if (seq_sk_match(seq, sk))
3b24d854 2348 return sk;
1da177e4 2349 }
b72acf45 2350
05c0b357
MKL
2351 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2352 spin_unlock(&ilb2->lock);
b72acf45
MKL
2353 ++st->bucket;
2354 return listening_get_first(seq);
1da177e4
LT
2355}
2356
2357static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2358{
a8b690f9
TH
2359 struct tcp_iter_state *st = seq->private;
2360 void *rc;
2361
2362 st->bucket = 0;
2363 st->offset = 0;
b72acf45 2364 rc = listening_get_first(seq);
1da177e4
LT
2365
2366 while (rc && *pos) {
2367 rc = listening_get_next(seq, rc);
2368 --*pos;
2369 }
2370 return rc;
2371}
2372
05dbc7b5 2373static inline bool empty_bucket(const struct tcp_iter_state *st)
6eac5604 2374{
05dbc7b5 2375 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
6eac5604
AK
2376}
2377
a8b690f9
TH
2378/*
2379 * Get first established socket starting from bucket given in st->bucket.
2380 * If st->bucket is zero, the very first socket in the hash is returned.
2381 */
1da177e4
LT
2382static void *established_get_first(struct seq_file *seq)
2383{
5799de0b 2384 struct tcp_iter_state *st = seq->private;
b08d4d3b 2385
a8b690f9
TH
2386 st->offset = 0;
2387 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 2388 struct sock *sk;
3ab5aee7 2389 struct hlist_nulls_node *node;
9db66bdc 2390 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 2391
6eac5604
AK
2392 /* Lockless fast path for the common case of empty buckets */
2393 if (empty_bucket(st))
2394 continue;
2395
9db66bdc 2396 spin_lock_bh(lock);
3ab5aee7 2397 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
ad2d6137
MKL
2398 if (seq_sk_match(seq, sk))
2399 return sk;
1da177e4 2400 }
9db66bdc 2401 spin_unlock_bh(lock);
1da177e4 2402 }
ad2d6137
MKL
2403
2404 return NULL;
1da177e4
LT
2405}
2406
2407static void *established_get_next(struct seq_file *seq, void *cur)
2408{
5799de0b 2409 struct tcp_iter_state *st = seq->private;
08eaef90
KI
2410 struct hlist_nulls_node *node;
2411 struct sock *sk = cur;
b08d4d3b 2412
1da177e4 2413 ++st->num;
a8b690f9 2414 ++st->offset;
1da177e4 2415
05dbc7b5 2416 sk = sk_nulls_next(sk);
1da177e4 2417
3ab5aee7 2418 sk_nulls_for_each_from(sk, node) {
ad2d6137 2419 if (seq_sk_match(seq, sk))
05dbc7b5 2420 return sk;
1da177e4
LT
2421 }
2422
05dbc7b5
ED
2423 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2424 ++st->bucket;
2425 return established_get_first(seq);
1da177e4
LT
2426}
2427
2428static void *established_get_idx(struct seq_file *seq, loff_t pos)
2429{
a8b690f9
TH
2430 struct tcp_iter_state *st = seq->private;
2431 void *rc;
2432
2433 st->bucket = 0;
2434 rc = established_get_first(seq);
1da177e4
LT
2435
2436 while (rc && pos) {
2437 rc = established_get_next(seq, rc);
2438 --pos;
7174259e 2439 }
1da177e4
LT
2440 return rc;
2441}
2442
2443static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2444{
2445 void *rc;
5799de0b 2446 struct tcp_iter_state *st = seq->private;
1da177e4 2447
1da177e4
LT
2448 st->state = TCP_SEQ_STATE_LISTENING;
2449 rc = listening_get_idx(seq, &pos);
2450
2451 if (!rc) {
1da177e4
LT
2452 st->state = TCP_SEQ_STATE_ESTABLISHED;
2453 rc = established_get_idx(seq, pos);
2454 }
2455
2456 return rc;
2457}
2458
a8b690f9
TH
2459static void *tcp_seek_last_pos(struct seq_file *seq)
2460{
2461 struct tcp_iter_state *st = seq->private;
525e2f9f 2462 int bucket = st->bucket;
a8b690f9
TH
2463 int offset = st->offset;
2464 int orig_num = st->num;
2465 void *rc = NULL;
2466
2467 switch (st->state) {
a8b690f9 2468 case TCP_SEQ_STATE_LISTENING:
05c0b357 2469 if (st->bucket > tcp_hashinfo.lhash2_mask)
a8b690f9
TH
2470 break;
2471 st->state = TCP_SEQ_STATE_LISTENING;
b72acf45 2472 rc = listening_get_first(seq);
525e2f9f 2473 while (offset-- && rc && bucket == st->bucket)
a8b690f9
TH
2474 rc = listening_get_next(seq, rc);
2475 if (rc)
2476 break;
2477 st->bucket = 0;
05dbc7b5 2478 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8eceea8 2479 fallthrough;
a8b690f9 2480 case TCP_SEQ_STATE_ESTABLISHED:
a8b690f9
TH
2481 if (st->bucket > tcp_hashinfo.ehash_mask)
2482 break;
2483 rc = established_get_first(seq);
525e2f9f 2484 while (offset-- && rc && bucket == st->bucket)
a8b690f9
TH
2485 rc = established_get_next(seq, rc);
2486 }
2487
2488 st->num = orig_num;
2489
2490 return rc;
2491}
2492
37d849bb 2493void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1da177e4 2494{
5799de0b 2495 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2496 void *rc;
2497
2498 if (*pos && *pos == st->last_pos) {
2499 rc = tcp_seek_last_pos(seq);
2500 if (rc)
2501 goto out;
2502 }
2503
1da177e4
LT
2504 st->state = TCP_SEQ_STATE_LISTENING;
2505 st->num = 0;
a8b690f9
TH
2506 st->bucket = 0;
2507 st->offset = 0;
2508 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2509
2510out:
2511 st->last_pos = *pos;
2512 return rc;
1da177e4 2513}
37d849bb 2514EXPORT_SYMBOL(tcp_seq_start);
1da177e4 2515
37d849bb 2516void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1da177e4 2517{
a8b690f9 2518 struct tcp_iter_state *st = seq->private;
1da177e4 2519 void *rc = NULL;
1da177e4
LT
2520
2521 if (v == SEQ_START_TOKEN) {
2522 rc = tcp_get_idx(seq, 0);
2523 goto out;
2524 }
1da177e4
LT
2525
2526 switch (st->state) {
1da177e4
LT
2527 case TCP_SEQ_STATE_LISTENING:
2528 rc = listening_get_next(seq, v);
2529 if (!rc) {
1da177e4 2530 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2531 st->bucket = 0;
2532 st->offset = 0;
1da177e4
LT
2533 rc = established_get_first(seq);
2534 }
2535 break;
2536 case TCP_SEQ_STATE_ESTABLISHED:
1da177e4
LT
2537 rc = established_get_next(seq, v);
2538 break;
2539 }
2540out:
2541 ++*pos;
a8b690f9 2542 st->last_pos = *pos;
1da177e4
LT
2543 return rc;
2544}
37d849bb 2545EXPORT_SYMBOL(tcp_seq_next);
1da177e4 2546
37d849bb 2547void tcp_seq_stop(struct seq_file *seq, void *v)
1da177e4 2548{
5799de0b 2549 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2550
2551 switch (st->state) {
1da177e4
LT
2552 case TCP_SEQ_STATE_LISTENING:
2553 if (v != SEQ_START_TOKEN)
05c0b357 2554 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
1da177e4 2555 break;
1da177e4
LT
2556 case TCP_SEQ_STATE_ESTABLISHED:
2557 if (v)
9db66bdc 2558 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2559 break;
2560 }
2561}
37d849bb 2562EXPORT_SYMBOL(tcp_seq_stop);
1da177e4 2563
d4f06873 2564static void get_openreq4(const struct request_sock *req,
aa3a0c8c 2565 struct seq_file *f, int i)
1da177e4 2566{
2e6599cb 2567 const struct inet_request_sock *ireq = inet_rsk(req);
fa76ce73 2568 long delta = req->rsk_timer.expires - jiffies;
1da177e4 2569
5e659e4c 2570 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2571 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2572 i,
634fb979 2573 ireq->ir_loc_addr,
d4f06873 2574 ireq->ir_num,
634fb979
ED
2575 ireq->ir_rmt_addr,
2576 ntohs(ireq->ir_rmt_port),
1da177e4
LT
2577 TCP_SYN_RECV,
2578 0, 0, /* could print option size, but that is af dependent. */
2579 1, /* timers active (only the expire timer) */
a399a805 2580 jiffies_delta_to_clock_t(delta),
e6c022a4 2581 req->num_timeout,
aa3a0c8c
ED
2582 from_kuid_munged(seq_user_ns(f),
2583 sock_i_uid(req->rsk_listener)),
1da177e4
LT
2584 0, /* non standard timer */
2585 0, /* open_requests have no inode */
d4f06873 2586 0,
652586df 2587 req);
1da177e4
LT
2588}
2589
652586df 2590static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
1da177e4
LT
2591{
2592 int timer_active;
2593 unsigned long timer_expires;
cf533ea5 2594 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2595 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2596 const struct inet_sock *inet = inet_sk(sk);
0536fcc0 2597 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2598 __be32 dest = inet->inet_daddr;
2599 __be32 src = inet->inet_rcv_saddr;
2600 __u16 destp = ntohs(inet->inet_dport);
2601 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2602 int rx_queue;
00fd38d9 2603 int state;
1da177e4 2604
6ba8a3b1 2605 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
57dde7f7 2606 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
6ba8a3b1 2607 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2608 timer_active = 1;
463c84b9
ACM
2609 timer_expires = icsk->icsk_timeout;
2610 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2611 timer_active = 4;
463c84b9 2612 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2613 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2614 timer_active = 2;
cf4c6bf8 2615 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2616 } else {
2617 timer_active = 0;
2618 timer_expires = jiffies;
2619 }
2620
986ffdfd 2621 state = inet_sk_state_load(sk);
00fd38d9 2622 if (state == TCP_LISTEN)
288efe86 2623 rx_queue = READ_ONCE(sk->sk_ack_backlog);
49d09007 2624 else
00fd38d9
ED
2625 /* Because we don't lock the socket,
2626 * we might find a transient negative value.
49d09007 2627 */
dba7d9b8 2628 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
7db48e98 2629 READ_ONCE(tp->copied_seq), 0);
49d09007 2630
5e659e4c 2631 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2632 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
00fd38d9 2633 i, src, srcp, dest, destp, state,
0f317464 2634 READ_ONCE(tp->write_seq) - tp->snd_una,
49d09007 2635 rx_queue,
1da177e4 2636 timer_active,
a399a805 2637 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2638 icsk->icsk_retransmits,
a7cb5a49 2639 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2640 icsk->icsk_probes_out,
cf4c6bf8 2641 sock_i_ino(sk),
41c6d650 2642 refcount_read(&sk->sk_refcnt), sk,
7be87351
SH
2643 jiffies_to_clock_t(icsk->icsk_rto),
2644 jiffies_to_clock_t(icsk->icsk_ack.ato),
31954cd8 2645 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
40570375 2646 tcp_snd_cwnd(tp),
00fd38d9
ED
2647 state == TCP_LISTEN ?
2648 fastopenq->max_qlen :
652586df 2649 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
1da177e4
LT
2650}
2651
cf533ea5 2652static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2653 struct seq_file *f, int i)
1da177e4 2654{
789f558c 2655 long delta = tw->tw_timer.expires - jiffies;
23f33c2d 2656 __be32 dest, src;
1da177e4 2657 __u16 destp, srcp;
1da177e4
LT
2658
2659 dest = tw->tw_daddr;
2660 src = tw->tw_rcv_saddr;
2661 destp = ntohs(tw->tw_dport);
2662 srcp = ntohs(tw->tw_sport);
2663
5e659e4c 2664 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2665 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2666 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2667 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
41c6d650 2668 refcount_read(&tw->tw_refcnt), tw);
1da177e4
LT
2669}
2670
2671#define TMPSZ 150
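/* Each record in /proc/net/tcp is padded to a fixed width of TMPSZ - 1
 * characters via seq_setwidth()/seq_pad() in tcp4_seq_show() below,
 * presumably to preserve the historic fixed-width line format.
 */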
2672
2673static int tcp4_seq_show(struct seq_file *seq, void *v)
2674{
5799de0b 2675 struct tcp_iter_state *st;
05dbc7b5 2676 struct sock *sk = v;
1da177e4 2677
652586df 2678 seq_setwidth(seq, TMPSZ - 1);
1da177e4 2679 if (v == SEQ_START_TOKEN) {
652586df 2680 seq_puts(seq, " sl local_address rem_address st tx_queue "
1da177e4
LT
2681 "rx_queue tr tm->when retrnsmt uid timeout "
2682 "inode");
2683 goto out;
2684 }
2685 st = seq->private;
2686
079096f1
ED
2687 if (sk->sk_state == TCP_TIME_WAIT)
2688 get_timewait4_sock(v, seq, st->num);
2689 else if (sk->sk_state == TCP_NEW_SYN_RECV)
aa3a0c8c 2690 get_openreq4(v, seq, st->num);
079096f1
ED
2691 else
2692 get_tcp4_sock(v, seq, st->num);
1da177e4 2693out:
652586df 2694 seq_pad(seq, '\n');
1da177e4
LT
2695 return 0;
2696}
2697
52d87d5f 2698#ifdef CONFIG_BPF_SYSCALL
04c7820b
MKL
2699struct bpf_tcp_iter_state {
2700 struct tcp_iter_state state;
2701 unsigned int cur_sk;
2702 unsigned int end_sk;
2703 unsigned int max_sk;
2704 struct sock **batch;
2705 bool st_bucket_done;
2706};
2707
52d87d5f
YS
2708struct bpf_iter__tcp {
2709 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2710 __bpf_md_ptr(struct sock_common *, sk_common);
2711 uid_t uid __aligned(8);
2712};
2713
2714static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2715 struct sock_common *sk_common, uid_t uid)
2716{
2717 struct bpf_iter__tcp ctx;
2718
2719 meta->seq_num--; /* skip SEQ_START_TOKEN */
2720 ctx.meta = meta;
2721 ctx.sk_common = sk_common;
2722 ctx.uid = uid;
2723 return bpf_iter_run_prog(prog, &ctx);
2724}
2725
04c7820b
MKL
2726static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2727{
2728 while (iter->cur_sk < iter->end_sk)
2729 sock_put(iter->batch[iter->cur_sk++]);
2730}
2731
2732static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2733 unsigned int new_batch_sz)
2734{
2735 struct sock **new_batch;
2736
2737 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2738 GFP_USER | __GFP_NOWARN);
2739 if (!new_batch)
2740 return -ENOMEM;
2741
2742 bpf_iter_tcp_put_batch(iter);
2743 kvfree(iter->batch);
2744 iter->batch = new_batch;
2745 iter->max_sk = new_batch_sz;
2746
2747 return 0;
2748}
2749
2750static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2751 struct sock *start_sk)
2752{
2753 struct bpf_tcp_iter_state *iter = seq->private;
2754 struct tcp_iter_state *st = &iter->state;
cae3873c 2755 struct hlist_nulls_node *node;
04c7820b
MKL
2756 unsigned int expected = 1;
2757 struct sock *sk;
2758
2759 sock_hold(start_sk);
2760 iter->batch[iter->end_sk++] = start_sk;
2761
cae3873c
MKL
2762 sk = sk_nulls_next(start_sk);
2763 sk_nulls_for_each_from(sk, node) {
04c7820b
MKL
2764 if (seq_sk_match(seq, sk)) {
2765 if (iter->end_sk < iter->max_sk) {
2766 sock_hold(sk);
2767 iter->batch[iter->end_sk++] = sk;
2768 }
2769 expected++;
2770 }
2771 }
2772 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2773
2774 return expected;
2775}
2776
2777static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2778 struct sock *start_sk)
2779{
2780 struct bpf_tcp_iter_state *iter = seq->private;
2781 struct tcp_iter_state *st = &iter->state;
2782 struct hlist_nulls_node *node;
2783 unsigned int expected = 1;
2784 struct sock *sk;
2785
2786 sock_hold(start_sk);
2787 iter->batch[iter->end_sk++] = start_sk;
2788
2789 sk = sk_nulls_next(start_sk);
2790 sk_nulls_for_each_from(sk, node) {
2791 if (seq_sk_match(seq, sk)) {
2792 if (iter->end_sk < iter->max_sk) {
2793 sock_hold(sk);
2794 iter->batch[iter->end_sk++] = sk;
2795 }
2796 expected++;
2797 }
2798 }
2799 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2800
2801 return expected;
2802}
2803
2804static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2805{
2806 struct bpf_tcp_iter_state *iter = seq->private;
2807 struct tcp_iter_state *st = &iter->state;
2808 unsigned int expected;
2809 bool resized = false;
2810 struct sock *sk;
2811
2812 /* The st->bucket is done. Directly advance to the next
2813 * bucket instead of having tcp_seek_last_pos() skip entries
2814 * one by one in the current bucket only to find out
2815 * it has to advance to the next bucket.
2816 */
2817 if (iter->st_bucket_done) {
2818 st->offset = 0;
2819 st->bucket++;
2820 if (st->state == TCP_SEQ_STATE_LISTENING &&
2821 st->bucket > tcp_hashinfo.lhash2_mask) {
2822 st->state = TCP_SEQ_STATE_ESTABLISHED;
2823 st->bucket = 0;
2824 }
2825 }
2826
2827again:
2828 /* Get a new batch */
2829 iter->cur_sk = 0;
2830 iter->end_sk = 0;
2831 iter->st_bucket_done = false;
2832
2833 sk = tcp_seek_last_pos(seq);
2834 if (!sk)
2835 return NULL; /* Done */
2836
2837 if (st->state == TCP_SEQ_STATE_LISTENING)
2838 expected = bpf_iter_tcp_listening_batch(seq, sk);
2839 else
2840 expected = bpf_iter_tcp_established_batch(seq, sk);
2841
2842 if (iter->end_sk == expected) {
2843 iter->st_bucket_done = true;
2844 return sk;
2845 }
2846
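 /* The bucket held more sockets than the batch could take: grow the
 * batch to 3/2 of the observed count and re-scan the bucket once.
 * Illustrative example: a bucket with 40 sockets and a 16-entry batch
 * triggers a realloc to 60 entries before the retry.
 */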
2847 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2848 resized = true;
2849 goto again;
2850 }
2851
2852 return sk;
2853}
2854
2855static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2856{
2857 /* bpf iter does not support lseek, so it always
2858 * continues from where it was stop()-ped.
2859 */
2860 if (*pos)
2861 return bpf_iter_tcp_batch(seq);
2862
2863 return SEQ_START_TOKEN;
2864}
2865
2866static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2867{
2868 struct bpf_tcp_iter_state *iter = seq->private;
2869 struct tcp_iter_state *st = &iter->state;
2870 struct sock *sk;
2871
2872 /* Whenever seq_next() is called, the iter->cur_sk is
2873 * done with seq_show(), so advance to the next sk in
2874 * the batch.
2875 */
2876 if (iter->cur_sk < iter->end_sk) {
2877 /* Keeping st->num consistent in tcp_iter_state.
2878 * bpf_iter_tcp does not use st->num.
2879 * meta.seq_num is used instead.
2880 */
2881 st->num++;
2882 /* Move st->offset to the next sk in the bucket such that
2883 * the future start() will resume at st->offset in
2884 * st->bucket. See tcp_seek_last_pos().
2885 */
2886 st->offset++;
2887 sock_put(iter->batch[iter->cur_sk++]);
2888 }
2889
2890 if (iter->cur_sk < iter->end_sk)
2891 sk = iter->batch[iter->cur_sk];
2892 else
2893 sk = bpf_iter_tcp_batch(seq);
2894
2895 ++*pos;
2896 /* Keeping st->last_pos consistent in tcp_iter_state.
2897 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2898 */
2899 st->last_pos = *pos;
2900 return sk;
2901}
2902
52d87d5f
YS
2903static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2904{
2905 struct bpf_iter_meta meta;
2906 struct bpf_prog *prog;
2907 struct sock *sk = v;
04c7820b 2908 bool slow;
52d87d5f 2909 uid_t uid;
04c7820b 2910 int ret;
52d87d5f
YS
2911
2912 if (v == SEQ_START_TOKEN)
2913 return 0;
2914
04c7820b
MKL
2915 if (sk_fullsock(sk))
2916 slow = lock_sock_fast(sk);
2917
2918 if (unlikely(sk_unhashed(sk))) {
2919 ret = SEQ_SKIP;
2920 goto unlock;
2921 }
2922
52d87d5f
YS
2923 if (sk->sk_state == TCP_TIME_WAIT) {
2924 uid = 0;
2925 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2926 const struct request_sock *req = v;
2927
2928 uid = from_kuid_munged(seq_user_ns(seq),
2929 sock_i_uid(req->rsk_listener));
2930 } else {
2931 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2932 }
2933
2934 meta.seq = seq;
2935 prog = bpf_iter_get_info(&meta, false);
04c7820b
MKL
2936 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2937
2938unlock:
2939 if (sk_fullsock(sk))
2940 unlock_sock_fast(sk, slow);
2941 return ret;
2942
52d87d5f
YS
2943}
2944
2945static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2946{
04c7820b 2947 struct bpf_tcp_iter_state *iter = seq->private;
52d87d5f
YS
2948 struct bpf_iter_meta meta;
2949 struct bpf_prog *prog;
2950
2951 if (!v) {
2952 meta.seq = seq;
2953 prog = bpf_iter_get_info(&meta, true);
2954 if (prog)
2955 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2956 }
2957
04c7820b
MKL
2958 if (iter->cur_sk < iter->end_sk) {
2959 bpf_iter_tcp_put_batch(iter);
2960 iter->st_bucket_done = false;
2961 }
52d87d5f
YS
2962}
2963
2964static const struct seq_operations bpf_iter_tcp_seq_ops = {
2965 .show = bpf_iter_tcp_seq_show,
04c7820b
MKL
2966 .start = bpf_iter_tcp_seq_start,
2967 .next = bpf_iter_tcp_seq_next,
52d87d5f
YS
2968 .stop = bpf_iter_tcp_seq_stop,
2969};
2970#endif
ad2d6137
MKL
2971static unsigned short seq_file_family(const struct seq_file *seq)
2972{
62001372 2973 const struct tcp_seq_afinfo *afinfo;
ad2d6137 2974
62001372 2975#ifdef CONFIG_BPF_SYSCALL
ad2d6137 2976 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */
62001372 2977 if (seq->op == &bpf_iter_tcp_seq_ops)
ad2d6137 2978 return AF_UNSPEC;
52d87d5f
YS
2979#endif
2980
ad2d6137 2981 /* Iterated from proc fs */
359745d7 2982 afinfo = pde_data(file_inode(seq->file));
ad2d6137
MKL
2983 return afinfo->family;
2984}
52d87d5f 2985
37d849bb
CH
2986static const struct seq_operations tcp4_seq_ops = {
2987 .show = tcp4_seq_show,
2988 .start = tcp_seq_start,
2989 .next = tcp_seq_next,
2990 .stop = tcp_seq_stop,
2991};
2992
1da177e4 2993static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4 2994 .family = AF_INET,
1da177e4
LT
2995};
2996
2c8c1e72 2997static int __net_init tcp4_proc_init_net(struct net *net)
757764f6 2998{
c3506372
CH
2999 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3000 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
37d849bb
CH
3001 return -ENOMEM;
3002 return 0;
757764f6
PE
3003}
3004
2c8c1e72 3005static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6 3006{
37d849bb 3007 remove_proc_entry("tcp", net->proc_net);
757764f6
PE
3008}
3009
3010static struct pernet_operations tcp4_net_ops = {
3011 .init = tcp4_proc_init_net,
3012 .exit = tcp4_proc_exit_net,
3013};
3014
1da177e4
LT
3015int __init tcp4_proc_init(void)
3016{
757764f6 3017 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
3018}
3019
3020void tcp4_proc_exit(void)
3021{
757764f6 3022 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
3023}
3024#endif /* CONFIG_PROC_FS */
3025
d3cd4924
ED
3026/* @wake is one when sk_stream_write_space() calls us.
3027 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3028 * This mimics the strategy used in sock_def_write_space().
3029 */
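/* Illustrative example: with tcp_notsent_lowat set to 128 KiB, a plain
 * poll (wake == 0) reports the socket writable while less than 128 KiB
 * is unsent, whereas sk_stream_write_space() (wake == 1) only signals
 * EPOLLOUT once the unsent backlog drops below 64 KiB.
 */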
3030bool tcp_stream_memory_free(const struct sock *sk, int wake)
3031{
3032 const struct tcp_sock *tp = tcp_sk(sk);
3033 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3034 READ_ONCE(tp->snd_nxt);
3035
3036 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3037}
3038EXPORT_SYMBOL(tcp_stream_memory_free);
3039
1da177e4
LT
3040struct proto tcp_prot = {
3041 .name = "TCP",
3042 .owner = THIS_MODULE,
3043 .close = tcp_close,
d74bad4e 3044 .pre_connect = tcp_v4_pre_connect,
1da177e4
LT
3045 .connect = tcp_v4_connect,
3046 .disconnect = tcp_disconnect,
463c84b9 3047 .accept = inet_csk_accept,
1da177e4
LT
3048 .ioctl = tcp_ioctl,
3049 .init = tcp_v4_init_sock,
3050 .destroy = tcp_v4_destroy_sock,
3051 .shutdown = tcp_shutdown,
3052 .setsockopt = tcp_setsockopt,
3053 .getsockopt = tcp_getsockopt,
9cacf81f 3054 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
4b9d07a4 3055 .keepalive = tcp_set_keepalive,
1da177e4 3056 .recvmsg = tcp_recvmsg,
7ba42910
CG
3057 .sendmsg = tcp_sendmsg,
3058 .sendpage = tcp_sendpage,
1da177e4 3059 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 3060 .release_cb = tcp_release_cb,
ab1e0a13
ACM
3061 .hash = inet_hash,
3062 .unhash = inet_unhash,
3063 .get_port = inet_csk_get_port,
91a760b2 3064 .put_port = inet_put_port,
8a59f9d1
CW
3065#ifdef CONFIG_BPF_SYSCALL
3066 .psock_update_sk_prot = tcp_bpf_update_proto,
3067#endif
1da177e4 3068 .enter_memory_pressure = tcp_enter_memory_pressure,
06044751 3069 .leave_memory_pressure = tcp_leave_memory_pressure,
c9bee3b7 3070 .stream_memory_free = tcp_stream_memory_free,
1da177e4 3071 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 3072 .orphan_count = &tcp_orphan_count,
0defbb0a 3073
1da177e4 3074 .memory_allocated = &tcp_memory_allocated,
0defbb0a
ED
3075 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3076
1da177e4 3077 .memory_pressure = &tcp_memory_pressure,
a4fe34bf 3078 .sysctl_mem = sysctl_tcp_mem,
356d1833
ED
3079 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3080 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
1da177e4
LT
3081 .max_header = MAX_TCP_HEADER,
3082 .obj_size = sizeof(struct tcp_sock),
5f0d5a3a 3083 .slab_flags = SLAB_TYPESAFE_BY_RCU,
6d6ee43e 3084 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 3085 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 3086 .h.hashinfo = &tcp_hashinfo,
7ba42910 3087 .no_autobind = true,
c1e64e29 3088 .diag_destroy = tcp_abort,
1da177e4 3089};
4bc2f18b 3090EXPORT_SYMBOL(tcp_prot);
1da177e4 3091
bdbbb852
ED
3092static void __net_exit tcp_sk_exit(struct net *net)
3093{
b506bc97 3094 if (net->ipv4.tcp_congestion_control)
0baf26b0
MKL
3095 bpf_module_put(net->ipv4.tcp_congestion_control,
3096 net->ipv4.tcp_congestion_control->owner);
bdbbb852
ED
3097}
3098
046ee902
DL
3099static int __net_init tcp_sk_init(struct net *net)
3100{
37ba017d 3101 int cnt;
49213555 3102
5d134f1c 3103 net->ipv4.sysctl_tcp_ecn = 2;
49213555
DB
3104 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3105
b0f9ca53 3106 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
5f3e2bf0 3107 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
6b58e0a5 3108 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
05cbc0db 3109 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
c04b79b6 3110 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
046ee902 3111
13b287e8 3112 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
9bd6861b 3113 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
b840d15d 3114 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
13b287e8 3115
6fa25166 3116 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
7c083ecb 3117 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
0aca737d 3118 net->ipv4.sysctl_tcp_syncookies = 1;
1043e25f 3119 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
ae5c3f40 3120 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
c6214a97 3121 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
c402d9be 3122 net->ipv4.sysctl_tcp_orphan_retries = 0;
1e579caa 3123 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
4979f2d9 3124 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
79e9fed4 3125 net->ipv4.sysctl_tcp_tw_reuse = 2;
65e6d901 3126 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
12ed8244 3127
e9bd0cca 3128 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
fee83d09 3129 cnt = tcp_hashinfo.ehash_mask + 1;
e9bd0cca
KI
3130 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3131 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
1946e672 3132
623d0c2d 3133 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
f9301034 3134 net->ipv4.sysctl_tcp_sack = 1;
9bb37ef0 3135 net->ipv4.sysctl_tcp_window_scaling = 1;
5d2ed052 3136 net->ipv4.sysctl_tcp_timestamps = 1;
2ae21cf5 3137 net->ipv4.sysctl_tcp_early_retrans = 3;
e20223f1 3138 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
b510f0d2 3139 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
e0a1e5b5 3140 net->ipv4.sysctl_tcp_retrans_collapse = 1;
c6e21803 3141 net->ipv4.sysctl_tcp_max_reordering = 300;
6496f6bd 3142 net->ipv4.sysctl_tcp_dsack = 1;
0c12654a 3143 net->ipv4.sysctl_tcp_app_win = 31;
94f0893e 3144 net->ipv4.sysctl_tcp_adv_win_scale = 1;
af9b69a7 3145 net->ipv4.sysctl_tcp_frto = 2;
4540c0cf 3146 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
d06a9904
ED
3147 /* This limits the percentage of the congestion window which we
3148 * will allow a single TSO frame to consume. Building TSO frames
3149 * which are too large can cause TCP streams to be bursty.
3150 */
3151 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
c73e5807
ED
3152 /* Default TSQ limit of 16 TSO segments */
3153 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
79e3602c
ED
3154
3155 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3156 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3157
26e9596e 3158 net->ipv4.sysctl_tcp_min_tso_segs = 2;
65466904 3159 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
bd239704 3160 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
790f00e1 3161 net->ipv4.sysctl_tcp_autocorking = 1;
4170ba6b 3162 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
23a7102a 3163 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
c26e91f8 3164 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
356d1833
ED
3165 if (net != &init_net) {
3166 memcpy(net->ipv4.sysctl_tcp_rmem,
3167 init_net.ipv4.sysctl_tcp_rmem,
3168 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3169 memcpy(net->ipv4.sysctl_tcp_wmem,
3170 init_net.ipv4.sysctl_tcp_wmem,
3171 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3172 }
6d82aa24 3173 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
a70437cc 3174 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
9c21d2fc 3175 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
e1cfcbe8 3176 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
213ad73d 3177 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3733be14 3178 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
e1cfcbe8 3179
6670e152
SH
3180 /* Reno is always built in */
3181 if (!net_eq(net, &init_net) &&
0baf26b0
MKL
3182 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3183 init_net.ipv4.tcp_congestion_control->owner))
6670e152
SH
3184 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3185 else
3186 net->ipv4.tcp_congestion_control = &tcp_reno;
3187
49213555 3188 return 0;
b099ce26
EB
3189}
3190
3191static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3192{
43713848
HY
3193 struct net *net;
3194
04c494e6
ED
3195 inet_twsk_purge(&tcp_hashinfo, AF_INET);
3196
e9bd0cca
KI
3197 list_for_each_entry(net, net_exit_list, exit_list) {
3198 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
43713848 3199 tcp_fastopen_ctx_destroy(net);
e9bd0cca 3200 }
046ee902
DL
3201}
3202
3203static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
3204 .init = tcp_sk_init,
3205 .exit = tcp_sk_exit,
3206 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
3207};
3208
52d87d5f
YS
3209#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3210DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3211 struct sock_common *sk_common, uid_t uid)
3212
04c7820b
MKL
3213#define INIT_BATCH_SZ 16
3214
f9c79272 3215static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
52d87d5f 3216{
04c7820b
MKL
3217 struct bpf_tcp_iter_state *iter = priv_data;
3218 int err;
52d87d5f 3219
04c7820b
MKL
3220 err = bpf_iter_init_seq_net(priv_data, aux);
3221 if (err)
3222 return err;
52d87d5f 3223
04c7820b
MKL
3224 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3225 if (err) {
3226 bpf_iter_fini_seq_net(priv_data);
3227 return err;
3228 }
3229
3230 return 0;
52d87d5f
YS
3231}
3232
3233static void bpf_iter_fini_tcp(void *priv_data)
3234{
04c7820b 3235 struct bpf_tcp_iter_state *iter = priv_data;
52d87d5f 3236
52d87d5f 3237 bpf_iter_fini_seq_net(priv_data);
04c7820b 3238 kvfree(iter->batch);
52d87d5f
YS
3239}
3240
14fc6bd6 3241static const struct bpf_iter_seq_info tcp_seq_info = {
52d87d5f
YS
3242 .seq_ops = &bpf_iter_tcp_seq_ops,
3243 .init_seq_private = bpf_iter_init_tcp,
3244 .fini_seq_private = bpf_iter_fini_tcp,
04c7820b 3245 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
14fc6bd6
YS
3246};
3247
3cee6fb8
MKL
3248static const struct bpf_func_proto *
3249bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3250 const struct bpf_prog *prog)
3251{
3252 switch (func_id) {
3253 case BPF_FUNC_setsockopt:
3254 return &bpf_sk_setsockopt_proto;
3255 case BPF_FUNC_getsockopt:
3256 return &bpf_sk_getsockopt_proto;
3257 default:
3258 return NULL;
3259 }
3260}
3261
14fc6bd6
YS
3262static struct bpf_iter_reg tcp_reg_info = {
3263 .target = "tcp",
52d87d5f
YS
3264 .ctx_arg_info_size = 1,
3265 .ctx_arg_info = {
3266 { offsetof(struct bpf_iter__tcp, sk_common),
3267 PTR_TO_BTF_ID_OR_NULL },
3268 },
3cee6fb8 3269 .get_func_proto = bpf_iter_tcp_get_func_proto,
14fc6bd6 3270 .seq_info = &tcp_seq_info,
52d87d5f
YS
3271};
3272
3273static void __init bpf_iter_register(void)
3274{
951cf368 3275 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
52d87d5f
YS
3276 if (bpf_iter_reg_target(&tcp_reg_info))
3277 pr_warn("Warning: could not register bpf iterator tcp\n");
3278}
3279
3280#endif
3281
9b0f976f 3282void __init tcp_v4_init(void)
1da177e4 3283{
37ba017d
ED
3284 int cpu, res;
3285
3286 for_each_possible_cpu(cpu) {
3287 struct sock *sk;
3288
3289 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3290 IPPROTO_TCP, &init_net);
3291 if (res)
3292 panic("Failed to create the TCP control socket.\n");
3293 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3294
3295 /* Please enforce IP_DF and IPID==0 for RST and
3296 * ACK sent in SYN-RECV and TIME-WAIT state.
3297 */
3298 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3299
3300 per_cpu(ipv4_tcp_sk, cpu) = sk;
3301 }
6a1b3054 3302 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 3303 panic("Failed to create the TCP control socket.\n");
52d87d5f
YS
3304
3305#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3306 bpf_iter_register();
3307#endif
1da177e4 3308}