// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash,
 *				half is devoted to TIME_WAIT sockets
 *				and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: ip options weren't passed to
 *				the TCP layer, missed a check for an
 *				ACK bit.
 *	Andi Kleen :		Implemented fast path mtu discovery.
 *				Fixed many serious bugs in the
 *				request_sock handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan :		Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov :	Transparent proxy revived after year coma.
 *	Andi Kleen :		Fix new listen.
 *	Andi Kleen :		Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);

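/* Initial sequence numbers and timestamp offsets are derived from the
 * connection 4-tuple via keyed hashes (secure_tcp_seq/secure_tcp_ts_off),
 * so they are stable for a given flow yet hard for an off-path attacker
 * to predict.
 */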
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
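			/* Pick a write_seq well beyond anything the old
			 * TIME-WAIT flow could have sent; 0 is avoided
			 * because a zero write_seq would make
			 * tcp_v4_connect() choose a fresh ISN instead.
			 */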
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = get_random_u16();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk_error_report(sk);
	} else {	/* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

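/* Prepare an outgoing TCP header for checksum offload: seed th->check with
 * the pseudo-header checksum and point csum_start/csum_offset at the TCP
 * checksum field so the device (or software fallback) can fold in the rest.
 */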
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 * for reset.
 * Answer: if a packet caused RST, it is not for a socket
 * existing in our system, if it is matched to a socket,
 * it is just duplicate segment or bug in other side's TCP.
 * So that we build reply only basing on parameters
 * arrived with segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#else
#define OPTION_BYTES sizeof(__be32)
#endif

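/* Replies are built on the stack and sent via the per-CPU control socket
 * (ipv4_tcp_sk), so an RST can be generated even when no local socket
 * matches the offending segment.
 */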
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

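/* ACK sent on behalf of a TIME-WAIT socket, e.g. in response to a
 * retransmitted FIN or an otherwise acceptable but out-of-window segment.
 */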
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
						 tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
 * socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
				inet_sk(sk)->tos;

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 * IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_SYMBOL(tcp_md5_needed);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address. */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc(sizeof(*md5sig), gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
			return -ENOMEM;

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
			return -ENOMEM;

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_SYMBOL(tcp_md5_key_copy);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

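/* IPv4-specific hooks plugged into the protocol-independent request_sock
 * machinery: how SYN-ACKs and ACKs are (re)sent, how resets are generated
 * and how a request is torn down.
 */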
72a3effa 1504struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1da177e4 1505 .family = PF_INET,
2e6599cb 1506 .obj_size = sizeof(struct tcp_request_sock),
5db92c99 1507 .rtx_syn_ack = tcp_rtx_synack,
60236fdd
ACM
1508 .send_ack = tcp_v4_reqsk_send_ack,
1509 .destructor = tcp_v4_reqsk_destructor,
1da177e4 1510 .send_reset = tcp_v4_send_reset,
688d1945 1511 .syn_ack_timeout = tcp_syn_ack_timeout,
1da177e4
LT
1512};
1513
35b2c321 1514const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
2aec4a29 1515 .mss_clamp = TCP_MSS_DEFAULT,
16bea70a 1516#ifdef CONFIG_TCP_MD5SIG
fd3a154a 1517 .req_md5_lookup = tcp_v4_md5_lookup,
e3afe7b7 1518 .calc_md5_hash = tcp_v4_md5_hash_skb,
b6332e6c 1519#endif
fb7b37a7
OP
1520#ifdef CONFIG_SYN_COOKIES
1521 .cookie_init_seq = cookie_v4_init_sequence,
1522#endif
d94e0417 1523 .route_req = tcp_v4_route_req,
84b114b9
ED
1524 .init_seq = tcp_v4_init_seq,
1525 .init_ts_off = tcp_v4_init_ts_off,
d6274bd8 1526 .send_synack = tcp_v4_send_synack,
16bea70a 1527};
cfb6eeb4 1528
1da177e4
LT
1529int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1530{
1da177e4 1531 /* Never answer to SYNs send to broadcast or multicast */
511c3f92 1532 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1da177e4
LT
1533 goto drop;
1534
1fb6f159
OP
1535 return tcp_conn_request(&tcp_request_sock_ops,
1536 &tcp_request_sock_ipv4_ops, sk, skb);
1da177e4 1537
1da177e4 1538drop:
9caad864 1539 tcp_listendrop(sk);
1da177e4
LT
1540 return 0;
1541}
4bc2f18b 1542EXPORT_SYMBOL(tcp_v4_conn_request);
1da177e4
LT
1543
1544
1545/*
1546 * The three way handshake has completed - we got a valid synack -
1547 * now create the new socket.
1548 */
0c27171e 1549struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
60236fdd 1550 struct request_sock *req,
5e0724d0
ED
1551 struct dst_entry *dst,
1552 struct request_sock *req_unhash,
1553 bool *own_req)
1da177e4 1554{
2e6599cb 1555 struct inet_request_sock *ireq;
01770a16 1556 bool found_dup_sk = false;
1da177e4
LT
1557 struct inet_sock *newinet;
1558 struct tcp_sock *newtp;
1559 struct sock *newsk;
cfb6eeb4 1560#ifdef CONFIG_TCP_MD5SIG
cea97609 1561 const union tcp_md5_addr *addr;
cfb6eeb4 1562 struct tcp_md5sig_key *key;
dea53bb8 1563 int l3index;
cfb6eeb4 1564#endif
f6d8bd05 1565 struct ip_options_rcu *inet_opt;
1da177e4
LT
1566
1567 if (sk_acceptq_is_full(sk))
1568 goto exit_overflow;
1569
1da177e4
LT
1570 newsk = tcp_create_openreq_child(sk, req, skb);
1571 if (!newsk)
093d2823 1572 goto exit_nonewsk;
1da177e4 1573
bcd76111 1574 newsk->sk_gso_type = SKB_GSO_TCPV4;
fae6ef87 1575 inet_sk_rx_dst_set(newsk, skb);
1da177e4
LT
1576
1577 newtp = tcp_sk(newsk);
1578 newinet = inet_sk(newsk);
2e6599cb 1579 ireq = inet_rsk(req);
d1e559d0
ED
1580 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1581 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
6dd9a14e 1582 newsk->sk_bound_dev_if = ireq->ir_iif;
c92e8c02
ED
1583 newinet->inet_saddr = ireq->ir_loc_addr;
1584 inet_opt = rcu_dereference(ireq->ireq_opt);
1585 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
463c84b9 1586 newinet->mc_index = inet_iif(skb);
eddc9ec5 1587 newinet->mc_ttl = ip_hdr(skb)->ttl;
4c507d28 1588 newinet->rcv_tos = ip_hdr(skb)->tos;
d83d8461 1589 inet_csk(newsk)->icsk_ext_hdr_len = 0;
f6d8bd05
ED
1590 if (inet_opt)
1591 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
7e3cf084 1592 newinet->inet_id = get_random_u16();
1da177e4 1593
8ef44b6f
WW
1594 /* Set ToS of the new socket based upon the value of incoming SYN.
1595 * ECT bits are set later in tcp_init_transfer().
1596 */
870e3a63 1597 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
ac8f1710
WW
1598 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1599
dfd25fff
ED
1600 if (!dst) {
1601 dst = inet_csk_route_child_sock(sk, newsk, req);
1602 if (!dst)
1603 goto put_and_exit;
1604 } else {
1605 /* syncookie case : see end of cookie_v4_check() */
1606 }
0e734419
DM
1607 sk_setup_caps(newsk, dst);
1608
81164413
DB
1609 tcp_ca_openreq_child(newsk, dst);
1610
1da177e4 1611 tcp_sync_mss(newsk, dst_mtu(dst));
3541f9e8 1612 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
f5fff5dc 1613
1da177e4
LT
1614 tcp_initialize_rcv_mss(newsk);
1615
cfb6eeb4 1616#ifdef CONFIG_TCP_MD5SIG
dea53bb8 1617 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
cfb6eeb4 1618 /* Copy over the MD5 key from the original socket */
cea97609 1619 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
dea53bb8 1620 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
00db4124 1621 if (key) {
b389d1af
DS
1622 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1623 goto put_and_exit;
aba54656 1624 sk_gso_disable(newsk);
cfb6eeb4
YH
1625 }
1626#endif
1627
0e734419
DM
1628 if (__inet_inherit_port(sk, newsk) < 0)
1629 goto put_and_exit;
01770a16
RD
1630 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1631 &found_dup_sk);
c92e8c02 1632 if (likely(*own_req)) {
49a496c9 1633 tcp_move_syn(newtp, req);
c92e8c02
ED
1634 ireq->ireq_opt = NULL;
1635 } else {
c89dffc7
KI
1636 newinet->inet_opt = NULL;
1637
01770a16
RD
1638 if (!req_unhash && found_dup_sk) {
1639 /* This code path should only be executed in the
1640 * syncookie case
1641 */
1642 bh_unlock_sock(newsk);
1643 sock_put(newsk);
1644 newsk = NULL;
01770a16 1645 }
c92e8c02 1646 }
1da177e4
LT
1647 return newsk;
1648
1649exit_overflow:
c10d9310 1650 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
093d2823
BS
1651exit_nonewsk:
1652 dst_release(dst);
1da177e4 1653exit:
9caad864 1654 tcp_listendrop(sk);
1da177e4 1655 return NULL;
0e734419 1656put_and_exit:
c92e8c02 1657 newinet->inet_opt = NULL;
e337e24d
CP
1658 inet_csk_prepare_forced_close(newsk);
1659 tcp_done(newsk);
0e734419 1660 goto exit;
1da177e4 1661}
4bc2f18b 1662EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1da177e4 1663
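/* Helper for the listener path below: a non-SYN segment that reaches a
 * TCP_LISTEN socket may carry a SYN cookie in its ACK. cookie_v4_check()
 * validates the cookie and, when it is valid, returns a freshly created
 * child socket; otherwise (or when CONFIG_SYN_COOKIES is not built in)
 * the listener itself is returned unchanged.
 */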
079096f1 1664static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1da177e4 1665{
079096f1 1666#ifdef CONFIG_SYN_COOKIES
52452c54 1667 const struct tcphdr *th = tcp_hdr(skb);
1da177e4 1668
af9b4738 1669 if (!th->syn)
461b74c3 1670 sk = cookie_v4_check(sk, skb);
1da177e4
LT
1671#endif
1672 return sk;
1673}
1674
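/* Compute the clamped MSS and an initial-sequence-number cookie for a SYN
 * aimed at this listener. This is the IPv4 side of SYN-cookie generation
 * done outside the normal listen/SYN-queue path (e.g. the BPF syncookie
 * helper); it returns 0 when syncookies are not compiled in or no cookie
 * is needed.
 */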
9349d600
PP
1675u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1676 struct tcphdr *th, u32 *cookie)
1677{
1678 u16 mss = 0;
1679#ifdef CONFIG_SYN_COOKIES
1680 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1681 &tcp_request_sock_ipv4_ops, sk, th);
1682 if (mss) {
1683 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1684 tcp_synq_overflow(sk);
1685 }
1686#endif
1687 return mss;
1688}
1689
bbd807df
BV
1690INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1691 u32));
1da177e4 1692/* The socket must have its spinlock held when we get
e994b2f0 1693 * here, unless it is a TCP_LISTEN socket.
1da177e4
LT
1694 *
1695 * We have a potential double-lock case here, so even when
1696 * doing backlog processing we use the BH locking scheme.
1697 * This is because we cannot sleep with the original spinlock
1698 * held.
1699 */
1700int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1701{
8eba65fa 1702 enum skb_drop_reason reason;
cfb6eeb4 1703 struct sock *rsk;
cfb6eeb4 1704
1da177e4 1705 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
8f905c0e
ED
1706 struct dst_entry *dst;
1707
1708 dst = rcu_dereference_protected(sk->sk_rx_dst,
1709 lockdep_sock_is_held(sk));
404e0a8b 1710
bdeab991 1711 sock_rps_save_rxhash(sk, skb);
3d97379a 1712 sk_mark_napi_id(sk, skb);
404e0a8b 1713 if (dst) {
0c0a5ef8 1714 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
bbd807df
BV
1715 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1716 dst, 0)) {
8f905c0e 1717 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
92101b3b 1718 dst_release(dst);
92101b3b
DM
1719 }
1720 }
3d97d88e 1721 tcp_rcv_established(sk, skb);
1da177e4
LT
1722 return 0;
1723 }
1724
8eba65fa 1725 reason = SKB_DROP_REASON_NOT_SPECIFIED;
12e25e10 1726 if (tcp_checksum_complete(skb))
1da177e4
LT
1727 goto csum_err;
1728
1729 if (sk->sk_state == TCP_LISTEN) {
079096f1
ED
1730 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1731
1da177e4
LT
1732 if (!nsk)
1733 goto discard;
1da177e4 1734 if (nsk != sk) {
cfb6eeb4
YH
1735 if (tcp_child_process(sk, nsk, skb)) {
1736 rsk = nsk;
1da177e4 1737 goto reset;
cfb6eeb4 1738 }
1da177e4
LT
1739 return 0;
1740 }
ca55158c 1741 } else
bdeab991 1742 sock_rps_save_rxhash(sk, skb);
ca55158c 1743
72ab4a86 1744 if (tcp_rcv_state_process(sk, skb)) {
cfb6eeb4 1745 rsk = sk;
1da177e4 1746 goto reset;
cfb6eeb4 1747 }
1da177e4
LT
1748 return 0;
1749
1750reset:
cfb6eeb4 1751 tcp_v4_send_reset(rsk, skb);
1da177e4 1752discard:
8eba65fa 1753 kfree_skb_reason(skb, reason);
1da177e4
LT
1754 /* Be careful here. If this function gets more complicated and
1755 * gcc suffers from register pressure on the x86, sk (in %ebx)
1756 * might be destroyed here. This current version compiles correctly,
1757 * but you have been warned.
1758 */
1759 return 0;
1760
1761csum_err:
8eba65fa 1762 reason = SKB_DROP_REASON_TCP_CSUM;
709c0314 1763 trace_tcp_bad_csum(skb);
c10d9310
ED
1764 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1765 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1da177e4
LT
1766 goto discard;
1767}
4bc2f18b 1768EXPORT_SYMBOL(tcp_v4_do_rcv);
1da177e4 1769
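/* Early demux: called from the IP receive path, before the routing decision,
 * to look up an already-established socket for this 4-tuple. On a hit the
 * socket and its cached rx dst are attached to the skb, letting the stack
 * skip both the route lookup and a second socket lookup in tcp_v4_rcv().
 */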
7487449c 1770int tcp_v4_early_demux(struct sk_buff *skb)
41063e9d 1771{
4461568a 1772 struct net *net = dev_net(skb->dev);
41063e9d
DM
1773 const struct iphdr *iph;
1774 const struct tcphdr *th;
1775 struct sock *sk;
41063e9d 1776
41063e9d 1777 if (skb->pkt_type != PACKET_HOST)
7487449c 1778 return 0;
41063e9d 1779
45f00f99 1780 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
7487449c 1781 return 0;
41063e9d
DM
1782
1783 iph = ip_hdr(skb);
45f00f99 1784 th = tcp_hdr(skb);
41063e9d
DM
1785
1786 if (th->doff < sizeof(struct tcphdr) / 4)
7487449c 1787 return 0;
41063e9d 1788
4461568a 1789 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
41063e9d 1790 iph->saddr, th->source,
7011d085 1791 iph->daddr, ntohs(th->dest),
3fa6f616 1792 skb->skb_iif, inet_sdif(skb));
41063e9d
DM
1793 if (sk) {
1794 skb->sk = sk;
1795 skb->destructor = sock_edemux;
f7e4eb03 1796 if (sk_fullsock(sk)) {
8f905c0e 1797 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
505fbcf0 1798
41063e9d
DM
1799 if (dst)
1800 dst = dst_check(dst, 0);
92101b3b 1801 if (dst &&
0c0a5ef8 1802 sk->sk_rx_dst_ifindex == skb->skb_iif)
92101b3b 1803 skb_dst_set_noref(skb, dst);
41063e9d
DM
1804 }
1805 }
7487449c 1806 return 0;
41063e9d
DM
1807}
1808
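/* Queue a segment on the backlog of a socket currently owned by the user.
 * The segment is checksummed first and, when possible, coalesced with the
 * tail of the backlog so a burst of small segments is charged as one skb.
 * The accepted backlog size is bounded roughly by
 * sk_rcvbuf + sk_sndbuf/2 + 64KB; anything beyond that is dropped and
 * accounted as TCPBacklogDrop.
 */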
7a26dc9e
MD
1809bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1810 enum skb_drop_reason *reason)
c9c33212 1811{
d519f350 1812 u32 limit, tail_gso_size, tail_gso_segs;
4f693b55
ED
1813 struct skb_shared_info *shinfo;
1814 const struct tcphdr *th;
1815 struct tcphdr *thtail;
1816 struct sk_buff *tail;
1817 unsigned int hdrlen;
1818 bool fragstolen;
1819 u32 gso_segs;
b160c285 1820 u32 gso_size;
4f693b55 1821 int delta;
c9c33212
ED
1822
1823 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1824 * we can fix skb->truesize to its real value to avoid future drops.
1825 * This is valid because skb is not yet charged to the socket.
1826 * It has been noticed that pure SACK packets were sometimes dropped
1827 * (if cooked by drivers without copybreak feature).
1828 */
60b1af33 1829 skb_condense(skb);
c9c33212 1830
ade9628e
ED
1831 skb_dst_drop(skb);
1832
4f693b55
ED
1833 if (unlikely(tcp_checksum_complete(skb))) {
1834 bh_unlock_sock(sk);
709c0314 1835 trace_tcp_bad_csum(skb);
7a26dc9e 1836 *reason = SKB_DROP_REASON_TCP_CSUM;
4f693b55
ED
1837 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1838 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1839 return true;
1840 }
1841
1842 /* Attempt coalescing to last skb in backlog, even if we are
1843 * above the limits.
1844 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1845 */
1846 th = (const struct tcphdr *)skb->data;
1847 hdrlen = th->doff * 4;
4f693b55
ED
1848
1849 tail = sk->sk_backlog.tail;
1850 if (!tail)
1851 goto no_coalesce;
1852 thtail = (struct tcphdr *)tail->data;
1853
1854 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1855 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1856 ((TCP_SKB_CB(tail)->tcp_flags |
ca2fe295
ED
1857 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1858 !((TCP_SKB_CB(tail)->tcp_flags &
1859 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
4f693b55
ED
1860 ((TCP_SKB_CB(tail)->tcp_flags ^
1861 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1862#ifdef CONFIG_TLS_DEVICE
1863 tail->decrypted != skb->decrypted ||
1864#endif
1865 thtail->doff != th->doff ||
1866 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1867 goto no_coalesce;
1868
1869 __skb_pull(skb, hdrlen);
b160c285
ED
1870
1871 shinfo = skb_shinfo(skb);
1872 gso_size = shinfo->gso_size ?: skb->len;
1873 gso_segs = shinfo->gso_segs ?: 1;
1874
1875 shinfo = skb_shinfo(tail);
1876 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1877 tail_gso_segs = shinfo->gso_segs ?: 1;
1878
4f693b55 1879 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
4f693b55
ED
1880 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1881
86bccd03 1882 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
4f693b55 1883 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
86bccd03
ED
1884 thtail->window = th->window;
1885 }
4f693b55 1886
ca2fe295
ED
1887 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1888 * thtail->fin, so that the fast path in tcp_rcv_established()
1889 * is not entered if we append a packet with a FIN.
1890 * SYN, RST, URG are not present.
1891 * ACK is set on both packets.
1892 * PSH : we do not really care in TCP stack,
1893 * at least for 'GRO' packets.
1894 */
1895 thtail->fin |= th->fin;
4f693b55
ED
1896 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1897
1898 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1899 TCP_SKB_CB(tail)->has_rxtstamp = true;
1900 tail->tstamp = skb->tstamp;
1901 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1902 }
1903
1904 /* Not as strict as GRO. We only need to carry mss max value */
b160c285
ED
1905 shinfo->gso_size = max(gso_size, tail_gso_size);
1906 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
4f693b55
ED
1907
1908 sk->sk_backlog.len += delta;
1909 __NET_INC_STATS(sock_net(sk),
1910 LINUX_MIB_TCPBACKLOGCOALESCE);
1911 kfree_skb_partial(skb, fragstolen);
1912 return false;
1913 }
1914 __skb_push(skb, hdrlen);
1915
1916no_coalesce:
ec791d81
LW
1917 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1918
4f693b55
ED
1919 /* Only the socket owner can try to collapse/prune the rx queues
1920 * to reduce memory overhead, so add a little headroom here.
1921 * Only a few socket backlogs are likely to be non-empty concurrently.
1922 */
ec791d81 1923 limit += 64 * 1024;
4f693b55 1924
c9c33212
ED
1925 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1926 bh_unlock_sock(sk);
7a26dc9e 1927 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
c9c33212
ED
1928 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1929 return true;
1930 }
1931 return false;
1932}
1933EXPORT_SYMBOL(tcp_add_backlog);
1934
ac6e7800
ED
1935int tcp_filter(struct sock *sk, struct sk_buff *skb)
1936{
1937 struct tcphdr *th = (struct tcphdr *)skb->data;
ac6e7800 1938
f2feaefd 1939 return sk_filter_trim_cap(sk, skb, th->doff * 4);
ac6e7800
ED
1940}
1941EXPORT_SYMBOL(tcp_filter);
1942
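/* tcp_v4_fill_cb() saves the IP control block inside TCP_SKB_CB() and fills
 * in the TCP-specific fields (seq, end_seq, ack_seq, flags, ...);
 * tcp_v4_restore_cb() undoes that move for the cases where the skb has to be
 * fed back through a fresh socket lookup (see the "goto lookup" paths in
 * tcp_v4_rcv() below).
 */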
eeea10b8
ED
1943static void tcp_v4_restore_cb(struct sk_buff *skb)
1944{
1945 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1946 sizeof(struct inet_skb_parm));
1947}
1948
1949static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1950 const struct tcphdr *th)
1951{
1952 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1953 * barrier() makes sure the compiler won't play fool^Waliasing games.
1954 */
1955 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1956 sizeof(struct inet_skb_parm));
1957 barrier();
1958
1959 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1960 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1961 skb->len - th->doff * 4);
1962 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1963 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1964 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1965 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1966 TCP_SKB_CB(skb)->sacked = 0;
1967 TCP_SKB_CB(skb)->has_rxtstamp =
1968 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1969}
1970
1da177e4
LT
1971/*
1972 * From tcp_input.c
1973 */
1974
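/* Main IPv4 receive entry point, invoked from the IP layer for every TCP
 * segment. Rough flow: validate header length and checksum, look up a
 * socket, handle the NEW_SYN_RECV and TIME_WAIT special cases, run the
 * minimum-TTL, XFRM, MD5 and socket-filter checks, then either process the
 * segment directly (socket not owned by user) or queue it on the backlog.
 */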
1975int tcp_v4_rcv(struct sk_buff *skb)
1976{
3b24d854 1977 struct net *net = dev_net(skb->dev);
643b622b 1978 enum skb_drop_reason drop_reason;
3fa6f616 1979 int sdif = inet_sdif(skb);
534322ca 1980 int dif = inet_iif(skb);
eddc9ec5 1981 const struct iphdr *iph;
cf533ea5 1982 const struct tcphdr *th;
3b24d854 1983 bool refcounted;
1da177e4
LT
1984 struct sock *sk;
1985 int ret;
1986
85125597 1987 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1da177e4
LT
1988 if (skb->pkt_type != PACKET_HOST)
1989 goto discard_it;
1990
1991 /* Count it even if it's bad */
90bbcc60 1992 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1da177e4
LT
1993
1994 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1995 goto discard_it;
1996
ea1627c2 1997 th = (const struct tcphdr *)skb->data;
1da177e4 1998
85125597
MD
1999 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2000 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1da177e4 2001 goto bad_packet;
85125597 2002 }
1da177e4
LT
2003 if (!pskb_may_pull(skb, th->doff * 4))
2004 goto discard_it;
2005
2006 /* An explanation is required here, I think.
2007 * Packet length and doff are validated by header prediction,
caa20d9a 2008 * provided the case of th->doff == 0 is eliminated.
1da177e4 2009 * So, we defer the checks. */
ed70fcfc
TH
2010
2011 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 2012 goto csum_error;
1da177e4 2013
ea1627c2 2014 th = (const struct tcphdr *)skb->data;
eddc9ec5 2015 iph = ip_hdr(skb);
4bdc3d66 2016lookup:
4461568a
KI
2017 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2018 skb, __tcp_hdrlen(th), th->source,
3fa6f616 2019 th->dest, sdif, &refcounted);
1da177e4
LT
2020 if (!sk)
2021 goto no_tcp_socket;
2022
bb134d5d
ED
2023process:
2024 if (sk->sk_state == TCP_TIME_WAIT)
2025 goto do_time_wait;
2026
079096f1
ED
2027 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2028 struct request_sock *req = inet_reqsk(sk);
e0f9759f 2029 bool req_stolen = false;
7716682c 2030 struct sock *nsk;
079096f1
ED
2031
2032 sk = req->rsk_listener;
6f0012e3
ED
2033 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2034 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2035 else
2036 drop_reason = tcp_inbound_md5_hash(sk, skb,
1330b6ef
JK
2037 &iph->saddr, &iph->daddr,
2038 AF_INET, dif, sdif);
2039 if (unlikely(drop_reason)) {
e65c332d 2040 sk_drops_add(sk, skb);
72923555
ED
2041 reqsk_put(req);
2042 goto discard_it;
2043 }
4fd44a98
FL
2044 if (tcp_checksum_complete(skb)) {
2045 reqsk_put(req);
2046 goto csum_error;
2047 }
7716682c 2048 if (unlikely(sk->sk_state != TCP_LISTEN)) {
d4f2c86b
KI
2049 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2050 if (!nsk) {
2051 inet_csk_reqsk_queue_drop_and_put(sk, req);
2052 goto lookup;
2053 }
2054 sk = nsk;
2055 /* reuseport_migrate_sock() has already held one sk_refcnt
2056 * before returning.
2057 */
2058 } else {
2059 /* We own a reference on the listener, increase it again
2060 * as we might lose it too soon.
2061 */
2062 sock_hold(sk);
4bdc3d66 2063 }
3b24d854 2064 refcounted = true;
1f3b359f 2065 nsk = NULL;
eeea10b8
ED
2066 if (!tcp_filter(sk, skb)) {
2067 th = (const struct tcphdr *)skb->data;
2068 iph = ip_hdr(skb);
2069 tcp_v4_fill_cb(skb, iph, th);
e0f9759f 2070 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
255f9034
MD
2071 } else {
2072 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
eeea10b8 2073 }
079096f1
ED
2074 if (!nsk) {
2075 reqsk_put(req);
e0f9759f
ED
2076 if (req_stolen) {
2077 /* Another cpu got exclusive access to req
2078 * and created a full blown socket.
2079 * Try to feed this packet to this socket
2080 * instead of discarding it.
2081 */
2082 tcp_v4_restore_cb(skb);
2083 sock_put(sk);
2084 goto lookup;
2085 }
7716682c 2086 goto discard_and_relse;
079096f1 2087 }
6f0012e3 2088 nf_reset_ct(skb);
079096f1 2089 if (nsk == sk) {
079096f1 2090 reqsk_put(req);
eeea10b8 2091 tcp_v4_restore_cb(skb);
079096f1
ED
2092 } else if (tcp_child_process(sk, nsk, skb)) {
2093 tcp_v4_send_reset(nsk, skb);
7716682c 2094 goto discard_and_relse;
079096f1 2095 } else {
7716682c 2096 sock_put(sk);
079096f1
ED
2097 return 0;
2098 }
2099 }
14834c4f 2100
020e71a3
ED
2101 if (static_branch_unlikely(&ip4_min_ttl)) {
2102 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2103 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2104 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2105 goto discard_and_relse;
2106 }
6cce09f8 2107 }
d218d111 2108
255f9034
MD
2109 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2110 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1da177e4 2111 goto discard_and_relse;
255f9034 2112 }
9ea88a15 2113
1330b6ef
JK
2114 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2115 &iph->daddr, AF_INET, dif, sdif);
2116 if (drop_reason)
9ea88a15 2117 goto discard_and_relse;
9ea88a15 2118
895b5c9f 2119 nf_reset_ct(skb);
1da177e4 2120
85125597 2121 if (tcp_filter(sk, skb)) {
364df53c 2122 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
1da177e4 2123 goto discard_and_relse;
85125597 2124 }
ac6e7800
ED
2125 th = (const struct tcphdr *)skb->data;
2126 iph = ip_hdr(skb);
eeea10b8 2127 tcp_v4_fill_cb(skb, iph, th);
1da177e4
LT
2128
2129 skb->dev = NULL;
2130
e994b2f0
ED
2131 if (sk->sk_state == TCP_LISTEN) {
2132 ret = tcp_v4_do_rcv(sk, skb);
2133 goto put_and_return;
2134 }
2135
2136 sk_incoming_cpu_update(sk);
2137
c6366184 2138 bh_lock_sock_nested(sk);
a44d6eac 2139 tcp_segs_in(tcp_sk(sk), skb);
1da177e4
LT
2140 ret = 0;
2141 if (!sock_owned_by_user(sk)) {
e7942d06 2142 ret = tcp_v4_do_rcv(sk, skb);
8b27dae5 2143 } else {
7a26dc9e 2144 if (tcp_add_backlog(sk, skb, &drop_reason))
8b27dae5 2145 goto discard_and_relse;
6b03a53a 2146 }
1da177e4
LT
2147 bh_unlock_sock(sk);
2148
e994b2f0 2149put_and_return:
3b24d854
ED
2150 if (refcounted)
2151 sock_put(sk);
1da177e4
LT
2152
2153 return ret;
2154
2155no_tcp_socket:
85125597 2156 drop_reason = SKB_DROP_REASON_NO_SOCKET;
1da177e4
LT
2157 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2158 goto discard_it;
2159
eeea10b8
ED
2160 tcp_v4_fill_cb(skb, iph, th);
2161
12e25e10 2162 if (tcp_checksum_complete(skb)) {
6a5dc9e5 2163csum_error:
85125597 2164 drop_reason = SKB_DROP_REASON_TCP_CSUM;
709c0314 2165 trace_tcp_bad_csum(skb);
90bbcc60 2166 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1da177e4 2167bad_packet:
90bbcc60 2168 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1da177e4 2169 } else {
cfb6eeb4 2170 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
2171 }
2172
2173discard_it:
f8319dfd 2174 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
1da177e4 2175 /* Discard frame. */
85125597 2176 kfree_skb_reason(skb, drop_reason);
e905a9ed 2177 return 0;
1da177e4
LT
2178
2179discard_and_relse:
532182cd 2180 sk_drops_add(sk, skb);
3b24d854
ED
2181 if (refcounted)
2182 sock_put(sk);
1da177e4
LT
2183 goto discard_it;
2184
2185do_time_wait:
2186 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
255f9034 2187 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
9469c7b4 2188 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
2189 goto discard_it;
2190 }
2191
eeea10b8
ED
2192 tcp_v4_fill_cb(skb, iph, th);
2193
6a5dc9e5
ED
2194 if (tcp_checksum_complete(skb)) {
2195 inet_twsk_put(inet_twsk(sk));
2196 goto csum_error;
1da177e4 2197 }
9469c7b4 2198 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 2199 case TCP_TW_SYN: {
4461568a
KI
2200 struct sock *sk2 = inet_lookup_listener(net,
2201 net->ipv4.tcp_death_row.hashinfo,
2202 skb, __tcp_hdrlen(th),
da5e3630 2203 iph->saddr, th->source,
eddc9ec5 2204 iph->daddr, th->dest,
3fa6f616
DA
2205 inet_iif(skb),
2206 sdif);
1da177e4 2207 if (sk2) {
dbe7faa4 2208 inet_twsk_deschedule_put(inet_twsk(sk));
1da177e4 2209 sk = sk2;
eeea10b8 2210 tcp_v4_restore_cb(skb);
3b24d854 2211 refcounted = false;
1da177e4
LT
2212 goto process;
2213 }
1da177e4 2214 }
fcfd6dfa 2215 /* to ACK */
a8eceea8 2216 fallthrough;
1da177e4
LT
2217 case TCP_TW_ACK:
2218 tcp_v4_timewait_ack(sk, skb);
2219 break;
2220 case TCP_TW_RST:
271c3b9b
FW
2221 tcp_v4_send_reset(sk, skb);
2222 inet_twsk_deschedule_put(inet_twsk(sk));
2223 goto discard_it;
1da177e4
LT
2224 case TCP_TW_SUCCESS:;
2225 }
2226 goto discard_it;
2227}
2228
ccb7c410
DM
2229static struct timewait_sock_ops tcp_timewait_sock_ops = {
2230 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2231 .twsk_unique = tcp_twsk_unique,
2232 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 2233};
1da177e4 2234
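/* Cache the input route of this skb on the socket so that subsequent
 * segments for the same connection can reuse it (see the early-demux and
 * established fast paths above).
 */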
63d02d15 2235void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
2236{
2237 struct dst_entry *dst = skb_dst(skb);
2238
5037e9ef 2239 if (dst && dst_hold_safe(dst)) {
8f905c0e 2240 rcu_assign_pointer(sk->sk_rx_dst, dst);
0c0a5ef8 2241 sk->sk_rx_dst_ifindex = skb->skb_iif;
ca777eff 2242 }
5d299f3d 2243}
63d02d15 2244EXPORT_SYMBOL(inet_sk_rx_dst_set);
5d299f3d 2245
3b401a81 2246const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
2247 .queue_xmit = ip_queue_xmit,
2248 .send_check = tcp_v4_send_check,
2249 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 2250 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
2251 .conn_request = tcp_v4_conn_request,
2252 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
2253 .net_header_len = sizeof(struct iphdr),
2254 .setsockopt = ip_setsockopt,
2255 .getsockopt = ip_getsockopt,
2256 .addr2sockaddr = inet_csk_addr2sockaddr,
2257 .sockaddr_len = sizeof(struct sockaddr_in),
4fab9071 2258 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 2259};
4bc2f18b 2260EXPORT_SYMBOL(ipv4_specific);
1da177e4 2261
cfb6eeb4 2262#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 2263static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 2264 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 2265 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 2266 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 2267};
b6332e6c 2268#endif
cfb6eeb4 2269
1da177e4
LT
2270/* NOTE: A lot of things are set to zero explicitly by the call to
2271 * sk_alloc(), so they need not be done here.
2272 */
2273static int tcp_v4_init_sock(struct sock *sk)
2274{
6687e988 2275 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 2276
900f65d3 2277 tcp_init_sock(sk);
1da177e4 2278
8292a17a 2279 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 2280
cfb6eeb4 2281#ifdef CONFIG_TCP_MD5SIG
ac807fa8 2282 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 2283#endif
1da177e4 2284
1da177e4
LT
2285 return 0;
2286}
2287
7d06b2e0 2288void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
2289{
2290 struct tcp_sock *tp = tcp_sk(sk);
2291
e1a4aa50
SL
2292 trace_tcp_destroy_sock(sk);
2293
1da177e4
LT
2294 tcp_clear_xmit_timers(sk);
2295
6687e988 2296 tcp_cleanup_congestion_control(sk);
317a76f9 2297
734942cc
DW
2298 tcp_cleanup_ulp(sk);
2299
1da177e4 2300 /* Clean up the write buffer. */
fe067e8a 2301 tcp_write_queue_purge(sk);
1da177e4 2302
cf1ef3f0
WW
2303 /* Check if we want to disable active TFO */
2304 tcp_fastopen_active_disable_ofo_check(sk);
2305
1da177e4 2306 /* Cleans up our, hopefully empty, out_of_order_queue. */
9f5afeae 2307 skb_rbtree_purge(&tp->out_of_order_queue);
1da177e4 2308
cfb6eeb4
YH
2309#ifdef CONFIG_TCP_MD5SIG
2310 /* Clean up the MD5 key list, if any */
2311 if (tp->md5sig_info) {
a915da9b 2312 tcp_clear_md5_list(sk);
fb7df5e4 2313 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
cfb6eeb4 2314 tp->md5sig_info = NULL;
459837b5 2315 static_branch_slow_dec_deferred(&tcp_md5_needed);
cfb6eeb4
YH
2316 }
2317#endif
1a2449a8 2318
1da177e4 2319 /* Clean up a referenced TCP bind bucket. */
463c84b9 2320 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 2321 inet_put_port(sk);
1da177e4 2322
d983ea6f 2323 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
435cf559 2324
cf60af03
YC
2325 /* If socket is aborted during connect operation */
2326 tcp_free_fastopen_req(tp);
1fba70e5 2327 tcp_fastopen_destroy_cipher(sk);
cd8ae852 2328 tcp_saved_syn_free(tp);
cf60af03 2329
180d8cd9 2330 sk_sockets_allocated_dec(sk);
1da177e4 2331}
1da177e4
LT
2332EXPORT_SYMBOL(tcp_v4_destroy_sock);
2333
2334#ifdef CONFIG_PROC_FS
2335/* Proc filesystem TCP sock list dumping. */
2336
ad2d6137
MKL
2337static unsigned short seq_file_family(const struct seq_file *seq);
2338
2339static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2340{
2341 unsigned short family = seq_file_family(seq);
2342
2343 /* AF_UNSPEC is used as a match all */
2344 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2345 net_eq(sock_net(sk), seq_file_net(seq)));
2346}
2347
b72acf45
MKL
2348/* Find a non-empty bucket (starting from st->bucket)
2349 * and return the first sk from it.
a8b690f9 2350 */
b72acf45 2351static void *listening_get_first(struct seq_file *seq)
1da177e4 2352{
4461568a 2353 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
5799de0b 2354 struct tcp_iter_state *st = seq->private;
1da177e4 2355
b72acf45 2356 st->offset = 0;
4461568a 2357 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
05c0b357 2358 struct inet_listen_hashbucket *ilb2;
cae3873c 2359 struct hlist_nulls_node *node;
b72acf45 2360 struct sock *sk;
b08d4d3b 2361
4461568a 2362 ilb2 = &hinfo->lhash2[st->bucket];
cae3873c 2363 if (hlist_nulls_empty(&ilb2->nulls_head))
b72acf45
MKL
2364 continue;
2365
05c0b357 2366 spin_lock(&ilb2->lock);
cae3873c 2367 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
b72acf45
MKL
2368 if (seq_sk_match(seq, sk))
2369 return sk;
2370 }
05c0b357 2371 spin_unlock(&ilb2->lock);
1da177e4 2372 }
b72acf45
MKL
2373
2374 return NULL;
2375}
2376
2377/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2378 * If "cur" is the last one in the st->bucket,
2379 * call listening_get_first() to return the first sk of the next
2380 * non-empty bucket.
a8b690f9 2381 */
1da177e4
LT
2382static void *listening_get_next(struct seq_file *seq, void *cur)
2383{
5799de0b 2384 struct tcp_iter_state *st = seq->private;
05c0b357 2385 struct inet_listen_hashbucket *ilb2;
cae3873c 2386 struct hlist_nulls_node *node;
4461568a 2387 struct inet_hashinfo *hinfo;
3b24d854 2388 struct sock *sk = cur;
1da177e4 2389
1da177e4 2390 ++st->num;
a8b690f9 2391 ++st->offset;
1da177e4 2392
cae3873c
MKL
2393 sk = sk_nulls_next(sk);
2394 sk_nulls_for_each_from(sk, node) {
ad2d6137 2395 if (seq_sk_match(seq, sk))
3b24d854 2396 return sk;
1da177e4 2397 }
b72acf45 2398
4461568a
KI
2399 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2400 ilb2 = &hinfo->lhash2[st->bucket];
05c0b357 2401 spin_unlock(&ilb2->lock);
b72acf45
MKL
2402 ++st->bucket;
2403 return listening_get_first(seq);
1da177e4
LT
2404}
2405
2406static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2407{
a8b690f9
TH
2408 struct tcp_iter_state *st = seq->private;
2409 void *rc;
2410
2411 st->bucket = 0;
2412 st->offset = 0;
b72acf45 2413 rc = listening_get_first(seq);
1da177e4
LT
2414
2415 while (rc && *pos) {
2416 rc = listening_get_next(seq, rc);
2417 --*pos;
2418 }
2419 return rc;
2420}
2421
4461568a
KI
2422static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2423 const struct tcp_iter_state *st)
6eac5604 2424{
4461568a 2425 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
6eac5604
AK
2426}
2427
a8b690f9
TH
2428/*
2429 * Get first established socket starting from bucket given in st->bucket.
2430 * If st->bucket is zero, the very first socket in the hash is returned.
2431 */
1da177e4
LT
2432static void *established_get_first(struct seq_file *seq)
2433{
4461568a 2434 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
5799de0b 2435 struct tcp_iter_state *st = seq->private;
b08d4d3b 2436
a8b690f9 2437 st->offset = 0;
4461568a 2438 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
1da177e4 2439 struct sock *sk;
3ab5aee7 2440 struct hlist_nulls_node *node;
4461568a 2441 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
1da177e4 2442
6eac5604 2443 /* Lockless fast path for the common case of empty buckets */
4461568a 2444 if (empty_bucket(hinfo, st))
6eac5604
AK
2445 continue;
2446
9db66bdc 2447 spin_lock_bh(lock);
4461568a 2448 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
ad2d6137
MKL
2449 if (seq_sk_match(seq, sk))
2450 return sk;
1da177e4 2451 }
9db66bdc 2452 spin_unlock_bh(lock);
1da177e4 2453 }
ad2d6137
MKL
2454
2455 return NULL;
1da177e4
LT
2456}
2457
2458static void *established_get_next(struct seq_file *seq, void *cur)
2459{
4461568a 2460 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
5799de0b 2461 struct tcp_iter_state *st = seq->private;
08eaef90
KI
2462 struct hlist_nulls_node *node;
2463 struct sock *sk = cur;
b08d4d3b 2464
1da177e4 2465 ++st->num;
a8b690f9 2466 ++st->offset;
1da177e4 2467
05dbc7b5 2468 sk = sk_nulls_next(sk);
1da177e4 2469
3ab5aee7 2470 sk_nulls_for_each_from(sk, node) {
ad2d6137 2471 if (seq_sk_match(seq, sk))
05dbc7b5 2472 return sk;
1da177e4
LT
2473 }
2474
4461568a 2475 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
05dbc7b5
ED
2476 ++st->bucket;
2477 return established_get_first(seq);
1da177e4
LT
2478}
2479
2480static void *established_get_idx(struct seq_file *seq, loff_t pos)
2481{
a8b690f9
TH
2482 struct tcp_iter_state *st = seq->private;
2483 void *rc;
2484
2485 st->bucket = 0;
2486 rc = established_get_first(seq);
1da177e4
LT
2487
2488 while (rc && pos) {
2489 rc = established_get_next(seq, rc);
2490 --pos;
7174259e 2491 }
1da177e4
LT
2492 return rc;
2493}
2494
2495static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2496{
2497 void *rc;
5799de0b 2498 struct tcp_iter_state *st = seq->private;
1da177e4 2499
1da177e4
LT
2500 st->state = TCP_SEQ_STATE_LISTENING;
2501 rc = listening_get_idx(seq, &pos);
2502
2503 if (!rc) {
1da177e4
LT
2504 st->state = TCP_SEQ_STATE_ESTABLISHED;
2505 rc = established_get_idx(seq, pos);
2506 }
2507
2508 return rc;
2509}
2510
a8b690f9
TH
2511static void *tcp_seek_last_pos(struct seq_file *seq)
2512{
4461568a 2513 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
a8b690f9 2514 struct tcp_iter_state *st = seq->private;
525e2f9f 2515 int bucket = st->bucket;
a8b690f9
TH
2516 int offset = st->offset;
2517 int orig_num = st->num;
2518 void *rc = NULL;
2519
2520 switch (st->state) {
a8b690f9 2521 case TCP_SEQ_STATE_LISTENING:
4461568a 2522 if (st->bucket > hinfo->lhash2_mask)
a8b690f9 2523 break;
b72acf45 2524 rc = listening_get_first(seq);
525e2f9f 2525 while (offset-- && rc && bucket == st->bucket)
a8b690f9
TH
2526 rc = listening_get_next(seq, rc);
2527 if (rc)
2528 break;
2529 st->bucket = 0;
05dbc7b5 2530 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8eceea8 2531 fallthrough;
a8b690f9 2532 case TCP_SEQ_STATE_ESTABLISHED:
4461568a 2533 if (st->bucket > hinfo->ehash_mask)
a8b690f9
TH
2534 break;
2535 rc = established_get_first(seq);
525e2f9f 2536 while (offset-- && rc && bucket == st->bucket)
a8b690f9
TH
2537 rc = established_get_next(seq, rc);
2538 }
2539
2540 st->num = orig_num;
2541
2542 return rc;
2543}
2544
37d849bb 2545void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1da177e4 2546{
5799de0b 2547 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2548 void *rc;
2549
2550 if (*pos && *pos == st->last_pos) {
2551 rc = tcp_seek_last_pos(seq);
2552 if (rc)
2553 goto out;
2554 }
2555
1da177e4
LT
2556 st->state = TCP_SEQ_STATE_LISTENING;
2557 st->num = 0;
a8b690f9
TH
2558 st->bucket = 0;
2559 st->offset = 0;
2560 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2561
2562out:
2563 st->last_pos = *pos;
2564 return rc;
1da177e4 2565}
37d849bb 2566EXPORT_SYMBOL(tcp_seq_start);
1da177e4 2567
37d849bb 2568void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1da177e4 2569{
a8b690f9 2570 struct tcp_iter_state *st = seq->private;
1da177e4 2571 void *rc = NULL;
1da177e4
LT
2572
2573 if (v == SEQ_START_TOKEN) {
2574 rc = tcp_get_idx(seq, 0);
2575 goto out;
2576 }
1da177e4
LT
2577
2578 switch (st->state) {
1da177e4
LT
2579 case TCP_SEQ_STATE_LISTENING:
2580 rc = listening_get_next(seq, v);
2581 if (!rc) {
1da177e4 2582 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2583 st->bucket = 0;
2584 st->offset = 0;
1da177e4
LT
2585 rc = established_get_first(seq);
2586 }
2587 break;
2588 case TCP_SEQ_STATE_ESTABLISHED:
1da177e4
LT
2589 rc = established_get_next(seq, v);
2590 break;
2591 }
2592out:
2593 ++*pos;
a8b690f9 2594 st->last_pos = *pos;
1da177e4
LT
2595 return rc;
2596}
37d849bb 2597EXPORT_SYMBOL(tcp_seq_next);
1da177e4 2598
37d849bb 2599void tcp_seq_stop(struct seq_file *seq, void *v)
1da177e4 2600{
4461568a 2601 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
5799de0b 2602 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2603
2604 switch (st->state) {
1da177e4
LT
2605 case TCP_SEQ_STATE_LISTENING:
2606 if (v != SEQ_START_TOKEN)
4461568a 2607 spin_unlock(&hinfo->lhash2[st->bucket].lock);
1da177e4 2608 break;
1da177e4
LT
2609 case TCP_SEQ_STATE_ESTABLISHED:
2610 if (v)
4461568a 2611 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
1da177e4
LT
2612 break;
2613 }
2614}
37d849bb 2615EXPORT_SYMBOL(tcp_seq_stop);
1da177e4 2616
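/* Formatting helpers for /proc/net/tcp: one fixed-width line per request
 * sock (SYN_RECV), full socket, or TIME_WAIT socket, in the legacy format
 * expected by existing userspace tools such as netstat.
 */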
d4f06873 2617static void get_openreq4(const struct request_sock *req,
aa3a0c8c 2618 struct seq_file *f, int i)
1da177e4 2619{
2e6599cb 2620 const struct inet_request_sock *ireq = inet_rsk(req);
fa76ce73 2621 long delta = req->rsk_timer.expires - jiffies;
1da177e4 2622
5e659e4c 2623 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2624 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2625 i,
634fb979 2626 ireq->ir_loc_addr,
d4f06873 2627 ireq->ir_num,
634fb979
ED
2628 ireq->ir_rmt_addr,
2629 ntohs(ireq->ir_rmt_port),
1da177e4
LT
2630 TCP_SYN_RECV,
2631 0, 0, /* could print option size, but that is af dependent. */
2632 1, /* timers active (only the expire timer) */
a399a805 2633 jiffies_delta_to_clock_t(delta),
e6c022a4 2634 req->num_timeout,
aa3a0c8c
ED
2635 from_kuid_munged(seq_user_ns(f),
2636 sock_i_uid(req->rsk_listener)),
1da177e4
LT
2637 0, /* non standard timer */
2638 0, /* open_requests have no inode */
d4f06873 2639 0,
652586df 2640 req);
1da177e4
LT
2641}
2642
652586df 2643static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
1da177e4
LT
2644{
2645 int timer_active;
2646 unsigned long timer_expires;
cf533ea5 2647 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2648 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2649 const struct inet_sock *inet = inet_sk(sk);
0536fcc0 2650 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2651 __be32 dest = inet->inet_daddr;
2652 __be32 src = inet->inet_rcv_saddr;
2653 __u16 destp = ntohs(inet->inet_dport);
2654 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2655 int rx_queue;
00fd38d9 2656 int state;
1da177e4 2657
6ba8a3b1 2658 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
57dde7f7 2659 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
6ba8a3b1 2660 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2661 timer_active = 1;
463c84b9
ACM
2662 timer_expires = icsk->icsk_timeout;
2663 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2664 timer_active = 4;
463c84b9 2665 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2666 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2667 timer_active = 2;
cf4c6bf8 2668 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2669 } else {
2670 timer_active = 0;
2671 timer_expires = jiffies;
2672 }
2673
986ffdfd 2674 state = inet_sk_state_load(sk);
00fd38d9 2675 if (state == TCP_LISTEN)
288efe86 2676 rx_queue = READ_ONCE(sk->sk_ack_backlog);
49d09007 2677 else
00fd38d9
ED
2678 /* Because we don't lock the socket,
2679 * we might find a transient negative value.
49d09007 2680 */
dba7d9b8 2681 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
7db48e98 2682 READ_ONCE(tp->copied_seq), 0);
49d09007 2683
5e659e4c 2684 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2685 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
00fd38d9 2686 i, src, srcp, dest, destp, state,
0f317464 2687 READ_ONCE(tp->write_seq) - tp->snd_una,
49d09007 2688 rx_queue,
1da177e4 2689 timer_active,
a399a805 2690 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2691 icsk->icsk_retransmits,
a7cb5a49 2692 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2693 icsk->icsk_probes_out,
cf4c6bf8 2694 sock_i_ino(sk),
41c6d650 2695 refcount_read(&sk->sk_refcnt), sk,
7be87351
SH
2696 jiffies_to_clock_t(icsk->icsk_rto),
2697 jiffies_to_clock_t(icsk->icsk_ack.ato),
31954cd8 2698 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
40570375 2699 tcp_snd_cwnd(tp),
00fd38d9
ED
2700 state == TCP_LISTEN ?
2701 fastopenq->max_qlen :
652586df 2702 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
1da177e4
LT
2703}
2704
cf533ea5 2705static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2706 struct seq_file *f, int i)
1da177e4 2707{
789f558c 2708 long delta = tw->tw_timer.expires - jiffies;
23f33c2d 2709 __be32 dest, src;
1da177e4 2710 __u16 destp, srcp;
1da177e4
LT
2711
2712 dest = tw->tw_daddr;
2713 src = tw->tw_rcv_saddr;
2714 destp = ntohs(tw->tw_dport);
2715 srcp = ntohs(tw->tw_sport);
2716
5e659e4c 2717 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2718 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2719 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2720 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
41c6d650 2721 refcount_read(&tw->tw_refcnt), tw);
1da177e4
LT
2722}
2723
2724#define TMPSZ 150
2725
2726static int tcp4_seq_show(struct seq_file *seq, void *v)
2727{
5799de0b 2728 struct tcp_iter_state *st;
05dbc7b5 2729 struct sock *sk = v;
1da177e4 2730
652586df 2731 seq_setwidth(seq, TMPSZ - 1);
1da177e4 2732 if (v == SEQ_START_TOKEN) {
652586df 2733 seq_puts(seq, " sl local_address rem_address st tx_queue "
1da177e4
LT
2734 "rx_queue tr tm->when retrnsmt uid timeout "
2735 "inode");
2736 goto out;
2737 }
2738 st = seq->private;
2739
079096f1
ED
2740 if (sk->sk_state == TCP_TIME_WAIT)
2741 get_timewait4_sock(v, seq, st->num);
2742 else if (sk->sk_state == TCP_NEW_SYN_RECV)
aa3a0c8c 2743 get_openreq4(v, seq, st->num);
079096f1
ED
2744 else
2745 get_tcp4_sock(v, seq, st->num);
1da177e4 2746out:
652586df 2747 seq_pad(seq, '\n');
1da177e4
LT
2748 return 0;
2749}
2750
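/* The BPF iterator below batches the sockets of one hash bucket (holding a
 * reference on each) before dropping the bucket lock. This lets
 * bpf_iter_tcp_seq_show() lock full sockets with lock_sock_fast() and lets
 * the attached program call bpf_setsockopt()/bpf_getsockopt() on them,
 * which would not be safe while still holding the bucket spinlock.
 */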
52d87d5f 2751#ifdef CONFIG_BPF_SYSCALL
04c7820b
MKL
2752struct bpf_tcp_iter_state {
2753 struct tcp_iter_state state;
2754 unsigned int cur_sk;
2755 unsigned int end_sk;
2756 unsigned int max_sk;
2757 struct sock **batch;
2758 bool st_bucket_done;
2759};
2760
52d87d5f
YS
2761struct bpf_iter__tcp {
2762 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2763 __bpf_md_ptr(struct sock_common *, sk_common);
2764 uid_t uid __aligned(8);
2765};
2766
2767static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2768 struct sock_common *sk_common, uid_t uid)
2769{
2770 struct bpf_iter__tcp ctx;
2771
2772 meta->seq_num--; /* skip SEQ_START_TOKEN */
2773 ctx.meta = meta;
2774 ctx.sk_common = sk_common;
2775 ctx.uid = uid;
2776 return bpf_iter_run_prog(prog, &ctx);
2777}
2778
04c7820b
MKL
2779static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2780{
2781 while (iter->cur_sk < iter->end_sk)
2782 sock_put(iter->batch[iter->cur_sk++]);
2783}
2784
2785static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2786 unsigned int new_batch_sz)
2787{
2788 struct sock **new_batch;
2789
2790 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2791 GFP_USER | __GFP_NOWARN);
2792 if (!new_batch)
2793 return -ENOMEM;
2794
2795 bpf_iter_tcp_put_batch(iter);
2796 kvfree(iter->batch);
2797 iter->batch = new_batch;
2798 iter->max_sk = new_batch_sz;
2799
2800 return 0;
2801}
2802
2803static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2804 struct sock *start_sk)
2805{
4461568a 2806 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
04c7820b
MKL
2807 struct bpf_tcp_iter_state *iter = seq->private;
2808 struct tcp_iter_state *st = &iter->state;
cae3873c 2809 struct hlist_nulls_node *node;
04c7820b
MKL
2810 unsigned int expected = 1;
2811 struct sock *sk;
2812
2813 sock_hold(start_sk);
2814 iter->batch[iter->end_sk++] = start_sk;
2815
cae3873c
MKL
2816 sk = sk_nulls_next(start_sk);
2817 sk_nulls_for_each_from(sk, node) {
04c7820b
MKL
2818 if (seq_sk_match(seq, sk)) {
2819 if (iter->end_sk < iter->max_sk) {
2820 sock_hold(sk);
2821 iter->batch[iter->end_sk++] = sk;
2822 }
2823 expected++;
2824 }
2825 }
4461568a 2826 spin_unlock(&hinfo->lhash2[st->bucket].lock);
04c7820b
MKL
2827
2828 return expected;
2829}
2830
2831static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2832 struct sock *start_sk)
2833{
4461568a 2834 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
04c7820b
MKL
2835 struct bpf_tcp_iter_state *iter = seq->private;
2836 struct tcp_iter_state *st = &iter->state;
2837 struct hlist_nulls_node *node;
2838 unsigned int expected = 1;
2839 struct sock *sk;
2840
2841 sock_hold(start_sk);
2842 iter->batch[iter->end_sk++] = start_sk;
2843
2844 sk = sk_nulls_next(start_sk);
2845 sk_nulls_for_each_from(sk, node) {
2846 if (seq_sk_match(seq, sk)) {
2847 if (iter->end_sk < iter->max_sk) {
2848 sock_hold(sk);
2849 iter->batch[iter->end_sk++] = sk;
2850 }
2851 expected++;
2852 }
2853 }
4461568a 2854 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
04c7820b
MKL
2855
2856 return expected;
2857}
2858
2859static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2860{
4461568a 2861 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
04c7820b
MKL
2862 struct bpf_tcp_iter_state *iter = seq->private;
2863 struct tcp_iter_state *st = &iter->state;
2864 unsigned int expected;
2865 bool resized = false;
2866 struct sock *sk;
2867
2868 /* The st->bucket is done. Directly advance to the next
2869 * bucket instead of letting tcp_seek_last_pos() skip
2870 * entries one by one in the current bucket only to find out
2871 * it has to advance to the next bucket.
2872 */
2873 if (iter->st_bucket_done) {
2874 st->offset = 0;
2875 st->bucket++;
2876 if (st->state == TCP_SEQ_STATE_LISTENING &&
4461568a 2877 st->bucket > hinfo->lhash2_mask) {
04c7820b
MKL
2878 st->state = TCP_SEQ_STATE_ESTABLISHED;
2879 st->bucket = 0;
2880 }
2881 }
2882
2883again:
2884 /* Get a new batch */
2885 iter->cur_sk = 0;
2886 iter->end_sk = 0;
2887 iter->st_bucket_done = false;
2888
2889 sk = tcp_seek_last_pos(seq);
2890 if (!sk)
2891 return NULL; /* Done */
2892
2893 if (st->state == TCP_SEQ_STATE_LISTENING)
2894 expected = bpf_iter_tcp_listening_batch(seq, sk);
2895 else
2896 expected = bpf_iter_tcp_established_batch(seq, sk);
2897
2898 if (iter->end_sk == expected) {
2899 iter->st_bucket_done = true;
2900 return sk;
2901 }
2902
2903 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2904 resized = true;
2905 goto again;
2906 }
2907
2908 return sk;
2909}
2910
2911static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2912{
2913 /* bpf iter does not support lseek, so it always
2914 * continues from where it was stop()-ped.
2915 */
2916 if (*pos)
2917 return bpf_iter_tcp_batch(seq);
2918
2919 return SEQ_START_TOKEN;
2920}
2921
2922static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2923{
2924 struct bpf_tcp_iter_state *iter = seq->private;
2925 struct tcp_iter_state *st = &iter->state;
2926 struct sock *sk;
2927
2928 /* Whenever seq_next() is called, the socket at iter->cur_sk is
2929 * done with seq_show(), so advance to the next sk in
2930 * the batch.
2931 */
2932 if (iter->cur_sk < iter->end_sk) {
2933 /* Keeping st->num consistent in tcp_iter_state.
2934 * bpf_iter_tcp does not use st->num.
2935 * meta.seq_num is used instead.
2936 */
2937 st->num++;
2938 /* Move st->offset to the next sk in the bucket such that
2939 * the future start() will resume at st->offset in
2940 * st->bucket. See tcp_seek_last_pos().
2941 */
2942 st->offset++;
2943 sock_put(iter->batch[iter->cur_sk++]);
2944 }
2945
2946 if (iter->cur_sk < iter->end_sk)
2947 sk = iter->batch[iter->cur_sk];
2948 else
2949 sk = bpf_iter_tcp_batch(seq);
2950
2951 ++*pos;
2952 /* Keeping st->last_pos consistent in tcp_iter_state.
2953 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2954 */
2955 st->last_pos = *pos;
2956 return sk;
2957}
2958
52d87d5f
YS
2959static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2960{
2961 struct bpf_iter_meta meta;
2962 struct bpf_prog *prog;
2963 struct sock *sk = v;
04c7820b 2964 bool slow;
52d87d5f 2965 uid_t uid;
04c7820b 2966 int ret;
52d87d5f
YS
2967
2968 if (v == SEQ_START_TOKEN)
2969 return 0;
2970
04c7820b
MKL
2971 if (sk_fullsock(sk))
2972 slow = lock_sock_fast(sk);
2973
2974 if (unlikely(sk_unhashed(sk))) {
2975 ret = SEQ_SKIP;
2976 goto unlock;
2977 }
2978
52d87d5f
YS
2979 if (sk->sk_state == TCP_TIME_WAIT) {
2980 uid = 0;
2981 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2982 const struct request_sock *req = v;
2983
2984 uid = from_kuid_munged(seq_user_ns(seq),
2985 sock_i_uid(req->rsk_listener));
2986 } else {
2987 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2988 }
2989
2990 meta.seq = seq;
2991 prog = bpf_iter_get_info(&meta, false);
04c7820b
MKL
2992 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2993
2994unlock:
2995 if (sk_fullsock(sk))
2996 unlock_sock_fast(sk, slow);
2997 return ret;
2998
52d87d5f
YS
2999}
3000
3001static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3002{
04c7820b 3003 struct bpf_tcp_iter_state *iter = seq->private;
52d87d5f
YS
3004 struct bpf_iter_meta meta;
3005 struct bpf_prog *prog;
3006
3007 if (!v) {
3008 meta.seq = seq;
3009 prog = bpf_iter_get_info(&meta, true);
3010 if (prog)
3011 (void)tcp_prog_seq_show(prog, &meta, v, 0);
3012 }
3013
04c7820b
MKL
3014 if (iter->cur_sk < iter->end_sk) {
3015 bpf_iter_tcp_put_batch(iter);
3016 iter->st_bucket_done = false;
3017 }
52d87d5f
YS
3018}
3019
3020static const struct seq_operations bpf_iter_tcp_seq_ops = {
3021 .show = bpf_iter_tcp_seq_show,
04c7820b
MKL
3022 .start = bpf_iter_tcp_seq_start,
3023 .next = bpf_iter_tcp_seq_next,
52d87d5f
YS
3024 .stop = bpf_iter_tcp_seq_stop,
3025};
3026#endif
ad2d6137
MKL
3027static unsigned short seq_file_family(const struct seq_file *seq)
3028{
62001372 3029 const struct tcp_seq_afinfo *afinfo;
ad2d6137 3030
62001372 3031#ifdef CONFIG_BPF_SYSCALL
ad2d6137 3032 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */
62001372 3033 if (seq->op == &bpf_iter_tcp_seq_ops)
ad2d6137 3034 return AF_UNSPEC;
52d87d5f
YS
3035#endif
3036
ad2d6137 3037 /* Iterated from proc fs */
359745d7 3038 afinfo = pde_data(file_inode(seq->file));
ad2d6137
MKL
3039 return afinfo->family;
3040}
52d87d5f 3041
37d849bb
CH
3042static const struct seq_operations tcp4_seq_ops = {
3043 .show = tcp4_seq_show,
3044 .start = tcp_seq_start,
3045 .next = tcp_seq_next,
3046 .stop = tcp_seq_stop,
3047};
3048
1da177e4 3049static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4 3050 .family = AF_INET,
1da177e4
LT
3051};
3052
2c8c1e72 3053static int __net_init tcp4_proc_init_net(struct net *net)
757764f6 3054{
c3506372
CH
3055 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3056 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
37d849bb
CH
3057 return -ENOMEM;
3058 return 0;
757764f6
PE
3059}
3060
2c8c1e72 3061static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6 3062{
37d849bb 3063 remove_proc_entry("tcp", net->proc_net);
757764f6
PE
3064}
3065
3066static struct pernet_operations tcp4_net_ops = {
3067 .init = tcp4_proc_init_net,
3068 .exit = tcp4_proc_exit_net,
3069};
3070
1da177e4
LT
3071int __init tcp4_proc_init(void)
3072{
757764f6 3073 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
3074}
3075
3076void tcp4_proc_exit(void)
3077{
757764f6 3078 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
3079}
3080#endif /* CONFIG_PROC_FS */
3081
d3cd4924
ED
3082/* @wake is 1 when sk_stream_write_space() calls us.
3083 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3084 * This mimics the strategy used in sock_def_write_space().
3085 */
3086bool tcp_stream_memory_free(const struct sock *sk, int wake)
3087{
3088 const struct tcp_sock *tp = tcp_sk(sk);
3089 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3090 READ_ONCE(tp->snd_nxt);
3091
3092 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3093}
3094EXPORT_SYMBOL(tcp_stream_memory_free);
3095
1da177e4
LT
3096struct proto tcp_prot = {
3097 .name = "TCP",
3098 .owner = THIS_MODULE,
3099 .close = tcp_close,
d74bad4e 3100 .pre_connect = tcp_v4_pre_connect,
1da177e4
LT
3101 .connect = tcp_v4_connect,
3102 .disconnect = tcp_disconnect,
463c84b9 3103 .accept = inet_csk_accept,
1da177e4
LT
3104 .ioctl = tcp_ioctl,
3105 .init = tcp_v4_init_sock,
3106 .destroy = tcp_v4_destroy_sock,
3107 .shutdown = tcp_shutdown,
3108 .setsockopt = tcp_setsockopt,
3109 .getsockopt = tcp_getsockopt,
9cacf81f 3110 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
4b9d07a4 3111 .keepalive = tcp_set_keepalive,
1da177e4 3112 .recvmsg = tcp_recvmsg,
7ba42910
CG
3113 .sendmsg = tcp_sendmsg,
3114 .sendpage = tcp_sendpage,
1da177e4 3115 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 3116 .release_cb = tcp_release_cb,
ab1e0a13
ACM
3117 .hash = inet_hash,
3118 .unhash = inet_unhash,
3119 .get_port = inet_csk_get_port,
91a760b2 3120 .put_port = inet_put_port,
8a59f9d1
CW
3121#ifdef CONFIG_BPF_SYSCALL
3122 .psock_update_sk_prot = tcp_bpf_update_proto,
3123#endif
1da177e4 3124 .enter_memory_pressure = tcp_enter_memory_pressure,
06044751 3125 .leave_memory_pressure = tcp_leave_memory_pressure,
c9bee3b7 3126 .stream_memory_free = tcp_stream_memory_free,
1da177e4 3127 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 3128 .orphan_count = &tcp_orphan_count,
0defbb0a 3129
1da177e4 3130 .memory_allocated = &tcp_memory_allocated,
0defbb0a
ED
3131 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3132
1da177e4 3133 .memory_pressure = &tcp_memory_pressure,
a4fe34bf 3134 .sysctl_mem = sysctl_tcp_mem,
356d1833
ED
3135 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3136 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
1da177e4
LT
3137 .max_header = MAX_TCP_HEADER,
3138 .obj_size = sizeof(struct tcp_sock),
5f0d5a3a 3139 .slab_flags = SLAB_TYPESAFE_BY_RCU,
6d6ee43e 3140 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 3141 .rsk_prot = &tcp_request_sock_ops,
429e42c1 3142 .h.hashinfo = NULL,
7ba42910 3143 .no_autobind = true,
c1e64e29 3144 .diag_destroy = tcp_abort,
1da177e4 3145};
4bc2f18b 3146EXPORT_SYMBOL(tcp_prot);
1da177e4 3147
bdbbb852
ED
3148static void __net_exit tcp_sk_exit(struct net *net)
3149{
b506bc97 3150 if (net->ipv4.tcp_congestion_control)
0baf26b0
MKL
3151 bpf_module_put(net->ipv4.tcp_congestion_control,
3152 net->ipv4.tcp_congestion_control->owner);
bdbbb852
ED
3153}
3154
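/* Pick the established hash table for a new netns. If the creating netns
 * set net.ipv4.tcp_child_ehash_entries, the child gets its own
 * (power-of-two sized) ehash; otherwise it shares the global tcp_hashinfo.
 * max_tw_buckets and max_syn_backlog are then scaled from the chosen table
 * size.
 */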
d1e5e640 3155static void __net_init tcp_set_hashinfo(struct net *net)
046ee902 3156{
d1e5e640
KI
3157 struct inet_hashinfo *hinfo;
3158 unsigned int ehash_entries;
3159 struct net *old_net;
3160
3161 if (net_eq(net, &init_net))
3162 goto fallback;
3163
3164 old_net = current->nsproxy->net_ns;
3165 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3166 if (!ehash_entries)
3167 goto fallback;
3168
3169 ehash_entries = roundup_pow_of_two(ehash_entries);
3170 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3171 if (!hinfo) {
3172 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3173 "for a netns, fallback to the global one\n",
3174 ehash_entries);
3175fallback:
3176 hinfo = &tcp_hashinfo;
3177 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3178 }
3179
3180 net->ipv4.tcp_death_row.hashinfo = hinfo;
3181 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3182 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3183}
49213555 3184
d1e5e640
KI
3185static int __net_init tcp_sk_init(struct net *net)
3186{
5d134f1c 3187 net->ipv4.sysctl_tcp_ecn = 2;
49213555
DB
3188 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3189
b0f9ca53 3190 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
5f3e2bf0 3191 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
6b58e0a5 3192 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
05cbc0db 3193 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
c04b79b6 3194 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
046ee902 3195
13b287e8 3196 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
9bd6861b 3197 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
b840d15d 3198 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
13b287e8 3199
6fa25166 3200 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
7c083ecb 3201 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
0aca737d 3202 net->ipv4.sysctl_tcp_syncookies = 1;
1043e25f 3203 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
ae5c3f40 3204 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
c6214a97 3205 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
c402d9be 3206 net->ipv4.sysctl_tcp_orphan_retries = 0;
1e579caa 3207 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
4979f2d9 3208 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
79e9fed4 3209 net->ipv4.sysctl_tcp_tw_reuse = 2;
65e6d901 3210 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
12ed8244 3211
e9bd0cca 3212 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
d1e5e640 3213 tcp_set_hashinfo(net);
1946e672 3214
f9301034 3215 net->ipv4.sysctl_tcp_sack = 1;
9bb37ef0 3216 net->ipv4.sysctl_tcp_window_scaling = 1;
5d2ed052 3217 net->ipv4.sysctl_tcp_timestamps = 1;
2ae21cf5 3218 net->ipv4.sysctl_tcp_early_retrans = 3;
e20223f1 3219 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
b510f0d2 3220 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
e0a1e5b5 3221 net->ipv4.sysctl_tcp_retrans_collapse = 1;
c6e21803 3222 net->ipv4.sysctl_tcp_max_reordering = 300;
6496f6bd 3223 net->ipv4.sysctl_tcp_dsack = 1;
0c12654a 3224 net->ipv4.sysctl_tcp_app_win = 31;
94f0893e 3225 net->ipv4.sysctl_tcp_adv_win_scale = 1;
af9b69a7 3226 net->ipv4.sysctl_tcp_frto = 2;
4540c0cf 3227 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
d06a9904
ED
3228 /* This limits the percentage of the congestion window which we
3229 * will allow a single TSO frame to consume. Building TSO frames
3230 * which are too large can cause TCP streams to be bursty.
3231 */
3232 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
c73e5807
ED
3233 /* Default TSQ limit of 16 TSO segments */
3234 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
79e3602c
ED
3235
3236 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3237 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3238
26e9596e 3239 net->ipv4.sysctl_tcp_min_tso_segs = 2;
65466904 3240 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
bd239704 3241 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
790f00e1 3242 net->ipv4.sysctl_tcp_autocorking = 1;
4170ba6b 3243 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
23a7102a 3244 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
c26e91f8 3245 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
356d1833
ED
3246 if (net != &init_net) {
3247 memcpy(net->ipv4.sysctl_tcp_rmem,
3248 init_net.ipv4.sysctl_tcp_rmem,
3249 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3250 memcpy(net->ipv4.sysctl_tcp_wmem,
3251 init_net.ipv4.sysctl_tcp_wmem,
3252 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3253 }
6d82aa24 3254 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
a70437cc 3255 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
9c21d2fc 3256 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
e1cfcbe8 3257 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
213ad73d 3258 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3733be14 3259 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
e1cfcbe8 3260
bd456f28
MAQ
3261 /* Set default values for PLB */
3262 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3263 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3264 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3265 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3266 /* Default congestion threshold for PLB to mark a round is 50% */
1a91bb7c 3267 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
bd456f28 3268
6670e152
SH
3269 /* Reno is always built in */
3270 if (!net_eq(net, &init_net) &&
0baf26b0
MKL
3271 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3272 init_net.ipv4.tcp_congestion_control->owner))
6670e152
SH
3273 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3274 else
3275 net->ipv4.tcp_congestion_control = &tcp_reno;
3276
49213555 3277 return 0;
b099ce26
EB
3278}
3279
3280static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3281{
43713848
HY
3282 struct net *net;
3283
edc12f03 3284 tcp_twsk_purge(net_exit_list, AF_INET);
04c494e6 3285
e9bd0cca 3286 list_for_each_entry(net, net_exit_list, exit_list) {
d1e5e640 3287 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
e9bd0cca 3288 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
43713848 3289 tcp_fastopen_ctx_destroy(net);
e9bd0cca 3290 }
046ee902
DL
3291}
3292
3293static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
3294 .init = tcp_sk_init,
3295 .exit = tcp_sk_exit,
3296 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
3297};
3298
52d87d5f
YS
3299#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3300DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3301 struct sock_common *sk_common, uid_t uid)
3302
04c7820b
MKL
3303#define INIT_BATCH_SZ 16
3304
f9c79272 3305static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
52d87d5f 3306{
04c7820b
MKL
3307 struct bpf_tcp_iter_state *iter = priv_data;
3308 int err;
52d87d5f 3309
04c7820b
MKL
3310 err = bpf_iter_init_seq_net(priv_data, aux);
3311 if (err)
3312 return err;
52d87d5f 3313
04c7820b
MKL
3314 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3315 if (err) {
3316 bpf_iter_fini_seq_net(priv_data);
3317 return err;
3318 }
3319
3320 return 0;
52d87d5f
YS
3321}
3322
3323static void bpf_iter_fini_tcp(void *priv_data)
3324{
04c7820b 3325 struct bpf_tcp_iter_state *iter = priv_data;
52d87d5f 3326
52d87d5f 3327 bpf_iter_fini_seq_net(priv_data);
04c7820b 3328 kvfree(iter->batch);
52d87d5f
YS
3329}
3330
14fc6bd6 3331static const struct bpf_iter_seq_info tcp_seq_info = {
52d87d5f
YS
3332 .seq_ops = &bpf_iter_tcp_seq_ops,
3333 .init_seq_private = bpf_iter_init_tcp,
3334 .fini_seq_private = bpf_iter_fini_tcp,
04c7820b 3335 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
14fc6bd6
YS
3336};
3337
3cee6fb8
MKL
3338static const struct bpf_func_proto *
3339bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3340 const struct bpf_prog *prog)
3341{
3342 switch (func_id) {
3343 case BPF_FUNC_setsockopt:
3344 return &bpf_sk_setsockopt_proto;
3345 case BPF_FUNC_getsockopt:
3346 return &bpf_sk_getsockopt_proto;
3347 default:
3348 return NULL;
3349 }
3350}
3351
14fc6bd6
YS
3352static struct bpf_iter_reg tcp_reg_info = {
3353 .target = "tcp",
52d87d5f
YS
3354 .ctx_arg_info_size = 1,
3355 .ctx_arg_info = {
3356 { offsetof(struct bpf_iter__tcp, sk_common),
3357 PTR_TO_BTF_ID_OR_NULL },
3358 },
3cee6fb8 3359 .get_func_proto = bpf_iter_tcp_get_func_proto,
14fc6bd6 3360 .seq_info = &tcp_seq_info,
52d87d5f
YS
3361};
3362
3363static void __init bpf_iter_register(void)
3364{
951cf368 3365 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
52d87d5f
YS
3366 if (bpf_iter_reg_target(&tcp_reg_info))
3367 pr_warn("Warning: could not register bpf iterator tcp\n");
3368}
3369
3370#endif
3371
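/* Boot-time initialization: create one raw control socket per possible CPU.
 * These per-cpu sockets (ipv4_tcp_sk) are used to emit RSTs and ACKs on
 * behalf of connections that have no full socket of their own, e.g. resets
 * for unknown connections and ACKs sent from SYN-RECV/TIME-WAIT handling.
 */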
9b0f976f 3372void __init tcp_v4_init(void)
1da177e4 3373{
37ba017d
ED
3374 int cpu, res;
3375
3376 for_each_possible_cpu(cpu) {
3377 struct sock *sk;
3378
3379 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3380 IPPROTO_TCP, &init_net);
3381 if (res)
3382 panic("Failed to create the TCP control socket.\n");
3383 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3384
3385 /* Please enforce IP_DF and IPID==0 for RST and
3386 * ACK sent in SYN-RECV and TIME-WAIT state.
3387 */
3388 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3389
3390 per_cpu(ipv4_tcp_sk, cpu) = sk;
3391 }
6a1b3054 3392 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 3393 panic("Failed to create the TCP control socket.\n");
52d87d5f
YS
3394
3395#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3396 bpf_iter_register();
3397#endif
1da177e4 3398}