net: annotate lockless accesses to sk->sk_ack_backlog
[linux-block.git] / net/ipv4/tcp_ipv4.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		  linux/ipv4/tcp.c
 *		  linux/ipv4/tcp_input.c
 *		  linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

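/* The initial sequence number and the timestamp offset for a flow are
 * derived from the connection 4-tuple and a boot-time secret via
 * secure_tcp_seq() and secure_tcp_ts_off(), so they are stable for a given
 * flow but hard to predict off-path.
 */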
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		skb = tcp_rtx_queue_head(sk);
		if (WARN_ON_ONCE(!skb))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);


		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

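/* The checksum helpers below only seed th->check with the pseudo-header sum
 * and set up csum_start/csum_offset, so the device (or the software
 * fallback) can complete the checksum (CHECKSUM_PARTIAL).
 */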
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *	So that we build reply only basing on parameters
 *	arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose any security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb),
					     tcp_v4_sdif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

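/* Reply with an ACK on behalf of a TIME-WAIT socket: the sequence numbers,
 * window and timestamps come from the timewait bucket rather than a full
 * socket, and the bucket reference taken during lookup is dropped here.
 */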
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt));
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;

		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

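/* Unlike __tcp_md5_do_lookup() above, which returns the best (longest)
 * prefix match, this helper requires an exact address and prefix length
 * match; it is used by the add/del paths so that keys with different
 * prefix lengths can coexist for the same address.
 */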
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
		   gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

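/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler: copy the key description
 * from userspace, validate the address family and prefix length, then add,
 * replace or (for a zero key length) delete the corresponding key.
 */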
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 prefixlen = 32;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET, prefixlen);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

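/* Feed the RFC 2385 pseudo-header into the MD5 transform: a tcp4_pseudohdr
 * (saddr, daddr, 0, IPPROTO_TCP, segment length) followed by a copy of the
 * TCP header with its checksum field zeroed out.
 */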
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
#endif
	return false;
}

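/* request_sock_ops hooks used while a connection is still a request:
 * tcp_v4_init_req() records the addresses (and any IP options) taken from
 * the incoming SYN, and tcp_v4_route_req() resolves the route used for the
 * SYN-ACK.
 */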
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr = ireq->ir_loc_addr;
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

1da177e4 1513{
079096f1 1514#ifdef CONFIG_SYN_COOKIES
52452c54 1515 const struct tcphdr *th = tcp_hdr(skb);
1da177e4 1516
af9b4738 1517 if (!th->syn)
461b74c3 1518 sk = cookie_v4_check(sk, skb);
1da177e4
LT
1519#endif
1520 return sk;
1521}
1522
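/* Compute a syncookie MSS and ISN for a SYN without allocating a request
 * sock (used e.g. by the BPF syncookie helper); tcp_synq_overflow() is
 * recorded so a later ACK will be treated as a possible cookie.
 */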
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

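/* Early demux: look up an established socket from the IP layer before
 * routing, so the socket's cached dst can be reused for this skb and the
 * full lookup in tcp_v4_rcv() is avoided.
 */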
int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

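/* Queue a segment on the socket backlog while the socket is owned by
 * process context.  Where possible the segment is coalesced with the tail
 * of the backlog (a GRO-like merge) to bound memory usage; returns true if
 * the skb was dropped (bad checksum or backlog limit exceeded).
 */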
c9c33212
ED
1646bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1647{
8265792b 1648 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
4f693b55
ED
1649 struct skb_shared_info *shinfo;
1650 const struct tcphdr *th;
1651 struct tcphdr *thtail;
1652 struct sk_buff *tail;
1653 unsigned int hdrlen;
1654 bool fragstolen;
1655 u32 gso_segs;
1656 int delta;
c9c33212
ED
1657
1658 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1659 * we can fix skb->truesize to its real value to avoid future drops.
1660 * This is valid because skb is not yet charged to the socket.
1661 * It has been noticed that pure SACK packets were sometimes dropped
1662 * (if cooked by drivers without a copybreak feature).
1663 */
60b1af33 1664 skb_condense(skb);
c9c33212 1665
ade9628e
ED
1666 skb_dst_drop(skb);
1667
4f693b55
ED
1668 if (unlikely(tcp_checksum_complete(skb))) {
1669 bh_unlock_sock(sk);
1670 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1671 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1672 return true;
1673 }
1674
1675 /* Attempt coalescing to the last skb in the backlog, even if we are
1676 * above the limits.
1677 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1678 */
1679 th = (const struct tcphdr *)skb->data;
1680 hdrlen = th->doff * 4;
1681 shinfo = skb_shinfo(skb);
1682
1683 if (!shinfo->gso_size)
1684 shinfo->gso_size = skb->len - hdrlen;
1685
1686 if (!shinfo->gso_segs)
1687 shinfo->gso_segs = 1;
1688
1689 tail = sk->sk_backlog.tail;
1690 if (!tail)
1691 goto no_coalesce;
1692 thtail = (struct tcphdr *)tail->data;
1693
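	/* Coalescing is attempted only when the new segment is exactly
	 * contiguous with the backlog tail, the DSCP/ECN byte matches,
	 * neither skb carries SYN/RST/URG, both carry ACK, ECE/CWR agree,
	 * the (optional) TLS decryption status matches, and the TCP
	 * headers, options included, are byte-for-byte identical.
	 */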
1694 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1695 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1696 ((TCP_SKB_CB(tail)->tcp_flags |
ca2fe295
ED
1697 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1698 !((TCP_SKB_CB(tail)->tcp_flags &
1699 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
4f693b55
ED
1700 ((TCP_SKB_CB(tail)->tcp_flags ^
1701 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1702#ifdef CONFIG_TLS_DEVICE
1703 tail->decrypted != skb->decrypted ||
1704#endif
1705 thtail->doff != th->doff ||
1706 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1707 goto no_coalesce;
1708
1709 __skb_pull(skb, hdrlen);
1710 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1711 thtail->window = th->window;
1712
1713 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1714
1715 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1716 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1717
ca2fe295
ED
1718 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1719 * thtail->fin, so that the fast path in tcp_rcv_established()
1720 * is not entered if we append a packet with a FIN.
1721 * SYN, RST, URG are not present.
1722 * ACK is set on both packets.
1723 * PSH : we do not really care in the TCP stack,
1724 * at least for 'GRO' packets.
1725 */
1726 thtail->fin |= th->fin;
4f693b55
ED
1727 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1728
1729 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1730 TCP_SKB_CB(tail)->has_rxtstamp = true;
1731 tail->tstamp = skb->tstamp;
1732 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1733 }
1734
1735 /* Not as strict as GRO. We only need to carry the max mss value. */
1736 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1737 skb_shinfo(tail)->gso_size);
1738
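	/* gso_segs is a 16-bit field in struct skb_shared_info, so the sum
	 * is clamped to 0xFFFF.
	 */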
1739 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1740 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1741
1742 sk->sk_backlog.len += delta;
1743 __NET_INC_STATS(sock_net(sk),
1744 LINUX_MIB_TCPBACKLOGCOALESCE);
1745 kfree_skb_partial(skb, fragstolen);
1746 return false;
1747 }
1748 __skb_push(skb, hdrlen);
1749
1750no_coalesce:
1751 /* Only the socket owner can try to collapse/prune rx queues
1752 * to reduce memory overhead, so add a little headroom here.
1753 * Only a few socket backlogs are likely to be non-empty at any given time.
1754 */
1755 limit += 64*1024;
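	/* The effective limit is thus roughly sk_rcvbuf + sk_sndbuf + 64KB;
	 * sk_add_backlog() below refuses the skb once the backlog length
	 * plus memory already charged to the receive queue exceeds it.
	 */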
1756
c9c33212
ED
1757 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1758 bh_unlock_sock(sk);
1759 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1760 return true;
1761 }
1762 return false;
1763}
1764EXPORT_SYMBOL(tcp_add_backlog);
1765
ac6e7800
ED
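/* Run the socket's attached filter (if any) on the skb; sk_filter_trim_cap()
 * ensures the filter can never trim the packet below the TCP header,
 * i.e. below th->doff * 4 bytes.
 */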
1766int tcp_filter(struct sock *sk, struct sk_buff *skb)
1767{
1768 struct tcphdr *th = (struct tcphdr *)skb->data;
ac6e7800 1769
f2feaefd 1770 return sk_filter_trim_cap(sk, skb, th->doff * 4);
ac6e7800
ED
1771}
1772EXPORT_SYMBOL(tcp_filter);
1773
eeea10b8
ED
1774static void tcp_v4_restore_cb(struct sk_buff *skb)
1775{
1776 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1777 sizeof(struct inet_skb_parm));
1778}
1779
1780static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1781 const struct tcphdr *th)
1782{
1783 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1784 * barrier() makes sure the compiler won't play aliasing games.
1785 */
1786 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1787 sizeof(struct inet_skb_parm));
1788 barrier();
1789
1790 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
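	/* SYN and FIN each consume one unit of sequence space, hence the
	 * th->syn + th->fin terms in end_seq below.
	 */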
1791 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1792 skb->len - th->doff * 4);
1793 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1794 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1795 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1796 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1797 TCP_SKB_CB(skb)->sacked = 0;
1798 TCP_SKB_CB(skb)->has_rxtstamp =
1799 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1800}
1801
1da177e4
LT
1802/*
1803 * From tcp_input.c
1804 */
1805
1806int tcp_v4_rcv(struct sk_buff *skb)
1807{
3b24d854 1808 struct net *net = dev_net(skb->dev);
8b27dae5 1809 struct sk_buff *skb_to_free;
3fa6f616 1810 int sdif = inet_sdif(skb);
eddc9ec5 1811 const struct iphdr *iph;
cf533ea5 1812 const struct tcphdr *th;
3b24d854 1813 bool refcounted;
1da177e4
LT
1814 struct sock *sk;
1815 int ret;
1816
1817 if (skb->pkt_type != PACKET_HOST)
1818 goto discard_it;
1819
1820 /* Count it even if it's bad */
90bbcc60 1821 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1da177e4
LT
1822
1823 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1824 goto discard_it;
1825
ea1627c2 1826 th = (const struct tcphdr *)skb->data;
1da177e4 1827
ea1627c2 1828 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1da177e4
LT
1829 goto bad_packet;
1830 if (!pskb_may_pull(skb, th->doff * 4))
1831 goto discard_it;
1832
1833 /* An explanation is required here, I think.
1834 * Packet length and doff are validated by header prediction,
caa20d9a 1835 * provided the case of th->doff == 0 is eliminated.
1da177e4 1836 * So, we defer the checks. */
ed70fcfc
TH
1837
1838 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1839 goto csum_error;
1da177e4 1840
ea1627c2 1841 th = (const struct tcphdr *)skb->data;
eddc9ec5 1842 iph = ip_hdr(skb);
4bdc3d66 1843lookup:
a583636a 1844 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
3fa6f616 1845 th->dest, sdif, &refcounted);
1da177e4
LT
1846 if (!sk)
1847 goto no_tcp_socket;
1848
bb134d5d
ED
1849process:
1850 if (sk->sk_state == TCP_TIME_WAIT)
1851 goto do_time_wait;
1852
079096f1
ED
1853 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1854 struct request_sock *req = inet_reqsk(sk);
e0f9759f 1855 bool req_stolen = false;
7716682c 1856 struct sock *nsk;
079096f1
ED
1857
1858 sk = req->rsk_listener;
72923555 1859 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
e65c332d 1860 sk_drops_add(sk, skb);
72923555
ED
1861 reqsk_put(req);
1862 goto discard_it;
1863 }
4fd44a98
FL
1864 if (tcp_checksum_complete(skb)) {
1865 reqsk_put(req);
1866 goto csum_error;
1867 }
7716682c 1868 if (unlikely(sk->sk_state != TCP_LISTEN)) {
f03f2e15 1869 inet_csk_reqsk_queue_drop_and_put(sk, req);
4bdc3d66
ED
1870 goto lookup;
1871 }
3b24d854
ED
1872 /* We own a reference on the listener, increase it again
1873 * as we might lose it too soon.
1874 */
7716682c 1875 sock_hold(sk);
3b24d854 1876 refcounted = true;
1f3b359f 1877 nsk = NULL;
eeea10b8
ED
1878 if (!tcp_filter(sk, skb)) {
1879 th = (const struct tcphdr *)skb->data;
1880 iph = ip_hdr(skb);
1881 tcp_v4_fill_cb(skb, iph, th);
e0f9759f 1882 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
eeea10b8 1883 }
079096f1
ED
1884 if (!nsk) {
1885 reqsk_put(req);
e0f9759f
ED
1886 if (req_stolen) {
1887 /* Another CPU got exclusive access to req
1888 * and created a full-blown socket.
1889 * Try to feed this packet to this socket
1890 * instead of discarding it.
1891 */
1892 tcp_v4_restore_cb(skb);
1893 sock_put(sk);
1894 goto lookup;
1895 }
7716682c 1896 goto discard_and_relse;
079096f1
ED
1897 }
1898 if (nsk == sk) {
079096f1 1899 reqsk_put(req);
eeea10b8 1900 tcp_v4_restore_cb(skb);
079096f1
ED
1901 } else if (tcp_child_process(sk, nsk, skb)) {
1902 tcp_v4_send_reset(nsk, skb);
7716682c 1903 goto discard_and_relse;
079096f1 1904 } else {
7716682c 1905 sock_put(sk);
079096f1
ED
1906 return 0;
1907 }
1908 }
6cce09f8 1909 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
02a1d6e7 1910 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1911 goto discard_and_relse;
6cce09f8 1912 }
d218d111 1913
1da177e4
LT
1914 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1915 goto discard_and_relse;
9ea88a15 1916
9ea88a15
DP
1917 if (tcp_v4_inbound_md5_hash(sk, skb))
1918 goto discard_and_relse;
9ea88a15 1919
895b5c9f 1920 nf_reset_ct(skb);
1da177e4 1921
ac6e7800 1922 if (tcp_filter(sk, skb))
1da177e4 1923 goto discard_and_relse;
ac6e7800
ED
1924 th = (const struct tcphdr *)skb->data;
1925 iph = ip_hdr(skb);
eeea10b8 1926 tcp_v4_fill_cb(skb, iph, th);
1da177e4
LT
1927
1928 skb->dev = NULL;
1929
e994b2f0
ED
1930 if (sk->sk_state == TCP_LISTEN) {
1931 ret = tcp_v4_do_rcv(sk, skb);
1932 goto put_and_return;
1933 }
1934
1935 sk_incoming_cpu_update(sk);
1936
c6366184 1937 bh_lock_sock_nested(sk);
a44d6eac 1938 tcp_segs_in(tcp_sk(sk), skb);
1da177e4
LT
1939 ret = 0;
1940 if (!sock_owned_by_user(sk)) {
8b27dae5
ED
1941 skb_to_free = sk->sk_rx_skb_cache;
1942 sk->sk_rx_skb_cache = NULL;
e7942d06 1943 ret = tcp_v4_do_rcv(sk, skb);
8b27dae5
ED
1944 } else {
1945 if (tcp_add_backlog(sk, skb))
1946 goto discard_and_relse;
1947 skb_to_free = NULL;
6b03a53a 1948 }
1da177e4 1949 bh_unlock_sock(sk);
8b27dae5
ED
1950 if (skb_to_free)
1951 __kfree_skb(skb_to_free);
1da177e4 1952
e994b2f0 1953put_and_return:
3b24d854
ED
1954 if (refcounted)
1955 sock_put(sk);
1da177e4
LT
1956
1957 return ret;
1958
1959no_tcp_socket:
1960 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1961 goto discard_it;
1962
eeea10b8
ED
1963 tcp_v4_fill_cb(skb, iph, th);
1964
12e25e10 1965 if (tcp_checksum_complete(skb)) {
6a5dc9e5 1966csum_error:
90bbcc60 1967 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1da177e4 1968bad_packet:
90bbcc60 1969 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1da177e4 1970 } else {
cfb6eeb4 1971 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
1972 }
1973
1974discard_it:
1975 /* Discard frame. */
1976 kfree_skb(skb);
e905a9ed 1977 return 0;
1da177e4
LT
1978
1979discard_and_relse:
532182cd 1980 sk_drops_add(sk, skb);
3b24d854
ED
1981 if (refcounted)
1982 sock_put(sk);
1da177e4
LT
1983 goto discard_it;
1984
1985do_time_wait:
1986 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 1987 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1988 goto discard_it;
1989 }
1990
eeea10b8
ED
1991 tcp_v4_fill_cb(skb, iph, th);
1992
6a5dc9e5
ED
1993 if (tcp_checksum_complete(skb)) {
1994 inet_twsk_put(inet_twsk(sk));
1995 goto csum_error;
1da177e4 1996 }
9469c7b4 1997 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 1998 case TCP_TW_SYN: {
c346dca1 1999 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
a583636a
CG
2000 &tcp_hashinfo, skb,
2001 __tcp_hdrlen(th),
da5e3630 2002 iph->saddr, th->source,
eddc9ec5 2003 iph->daddr, th->dest,
3fa6f616
DA
2004 inet_iif(skb),
2005 sdif);
1da177e4 2006 if (sk2) {
dbe7faa4 2007 inet_twsk_deschedule_put(inet_twsk(sk));
1da177e4 2008 sk = sk2;
eeea10b8 2009 tcp_v4_restore_cb(skb);
3b24d854 2010 refcounted = false;
1da177e4
LT
2011 goto process;
2012 }
1da177e4 2013 }
fcfd6dfa
GS
2014 /* to ACK */
2015 /* fall through */
1da177e4
LT
2016 case TCP_TW_ACK:
2017 tcp_v4_timewait_ack(sk, skb);
2018 break;
2019 case TCP_TW_RST:
271c3b9b
FW
2020 tcp_v4_send_reset(sk, skb);
2021 inet_twsk_deschedule_put(inet_twsk(sk));
2022 goto discard_it;
1da177e4
LT
2023 case TCP_TW_SUCCESS:;
2024 }
2025 goto discard_it;
2026}
2027
ccb7c410
DM
2028static struct timewait_sock_ops tcp_timewait_sock_ops = {
2029 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2030 .twsk_unique = tcp_twsk_unique,
2031 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 2032};
1da177e4 2033
63d02d15 2034void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
2035{
2036 struct dst_entry *dst = skb_dst(skb);
2037
5037e9ef 2038 if (dst && dst_hold_safe(dst)) {
ca777eff
ED
2039 sk->sk_rx_dst = dst;
2040 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2041 }
5d299f3d 2042}
63d02d15 2043EXPORT_SYMBOL(inet_sk_rx_dst_set);
5d299f3d 2044
3b401a81 2045const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
2046 .queue_xmit = ip_queue_xmit,
2047 .send_check = tcp_v4_send_check,
2048 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 2049 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
2050 .conn_request = tcp_v4_conn_request,
2051 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
2052 .net_header_len = sizeof(struct iphdr),
2053 .setsockopt = ip_setsockopt,
2054 .getsockopt = ip_getsockopt,
2055 .addr2sockaddr = inet_csk_addr2sockaddr,
2056 .sockaddr_len = sizeof(struct sockaddr_in),
3fdadf7d 2057#ifdef CONFIG_COMPAT
543d9cfe
ACM
2058 .compat_setsockopt = compat_ip_setsockopt,
2059 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 2060#endif
4fab9071 2061 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 2062};
4bc2f18b 2063EXPORT_SYMBOL(ipv4_specific);
1da177e4 2064
cfb6eeb4 2065#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 2066static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 2067 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 2068 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 2069 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 2070};
b6332e6c 2071#endif
cfb6eeb4 2072
1da177e4
LT
2073/* NOTE: A lot of things are set to zero explicitly by the call to
2074 * sk_alloc(), so they need not be done here.
2075 */
2076static int tcp_v4_init_sock(struct sock *sk)
2077{
6687e988 2078 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 2079
900f65d3 2080 tcp_init_sock(sk);
1da177e4 2081
8292a17a 2082 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 2083
cfb6eeb4 2084#ifdef CONFIG_TCP_MD5SIG
ac807fa8 2085 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 2086#endif
1da177e4 2087
1da177e4
LT
2088 return 0;
2089}
2090
7d06b2e0 2091void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
2092{
2093 struct tcp_sock *tp = tcp_sk(sk);
2094
e1a4aa50
SL
2095 trace_tcp_destroy_sock(sk);
2096
1da177e4
LT
2097 tcp_clear_xmit_timers(sk);
2098
6687e988 2099 tcp_cleanup_congestion_control(sk);
317a76f9 2100
734942cc
DW
2101 tcp_cleanup_ulp(sk);
2102
1da177e4 2103 /* Clean up the write buffer. */
fe067e8a 2104 tcp_write_queue_purge(sk);
1da177e4 2105
cf1ef3f0
WW
2106 /* Check if we want to disable active TFO */
2107 tcp_fastopen_active_disable_ofo_check(sk);
2108
1da177e4 2109 /* Cleans up our, hopefully empty, out_of_order_queue. */
9f5afeae 2110 skb_rbtree_purge(&tp->out_of_order_queue);
1da177e4 2111
cfb6eeb4
YH
2112#ifdef CONFIG_TCP_MD5SIG
2113 /* Clean up the MD5 key list, if any */
2114 if (tp->md5sig_info) {
a915da9b 2115 tcp_clear_md5_list(sk);
fb7df5e4 2116 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
cfb6eeb4
YH
2117 tp->md5sig_info = NULL;
2118 }
2119#endif
1a2449a8 2120
1da177e4 2121 /* Clean up a referenced TCP bind bucket. */
463c84b9 2122 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 2123 inet_put_port(sk);
1da177e4 2124
d983ea6f 2125 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
435cf559 2126
cf60af03
YC
2127 /* If socket is aborted during connect operation */
2128 tcp_free_fastopen_req(tp);
1fba70e5 2129 tcp_fastopen_destroy_cipher(sk);
cd8ae852 2130 tcp_saved_syn_free(tp);
cf60af03 2131
180d8cd9 2132 sk_sockets_allocated_dec(sk);
1da177e4 2133}
1da177e4
LT
2134EXPORT_SYMBOL(tcp_v4_destroy_sock);
2135
2136#ifdef CONFIG_PROC_FS
2137/* Proc filesystem TCP sock list dumping. */
2138
a8b690f9
TH
2139/*
2140 * Get the next listener socket following cur. If cur is NULL, get the first socket
2141 * starting from bucket given in st->bucket; when st->bucket is zero the
2142 * very first socket in the hash table is returned.
2143 */
1da177e4
LT
2144static void *listening_get_next(struct seq_file *seq, void *cur)
2145{
37d849bb 2146 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
5799de0b 2147 struct tcp_iter_state *st = seq->private;
a4146b1b 2148 struct net *net = seq_file_net(seq);
3b24d854 2149 struct inet_listen_hashbucket *ilb;
3b24d854 2150 struct sock *sk = cur;
1da177e4
LT
2151
2152 if (!sk) {
3b24d854 2153get_head:
a8b690f9 2154 ilb = &tcp_hashinfo.listening_hash[st->bucket];
9652dc2e 2155 spin_lock(&ilb->lock);
3b24d854 2156 sk = sk_head(&ilb->head);
a8b690f9 2157 st->offset = 0;
1da177e4
LT
2158 goto get_sk;
2159 }
5caea4ea 2160 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 2161 ++st->num;
a8b690f9 2162 ++st->offset;
1da177e4 2163
3b24d854 2164 sk = sk_next(sk);
1da177e4 2165get_sk:
3b24d854 2166 sk_for_each_from(sk) {
8475ef9f
PE
2167 if (!net_eq(sock_net(sk), net))
2168 continue;
37d849bb 2169 if (sk->sk_family == afinfo->family)
3b24d854 2170 return sk;
1da177e4 2171 }
9652dc2e 2172 spin_unlock(&ilb->lock);
a8b690f9 2173 st->offset = 0;
3b24d854
ED
2174 if (++st->bucket < INET_LHTABLE_SIZE)
2175 goto get_head;
2176 return NULL;
1da177e4
LT
2177}
2178
2179static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2180{
a8b690f9
TH
2181 struct tcp_iter_state *st = seq->private;
2182 void *rc;
2183
2184 st->bucket = 0;
2185 st->offset = 0;
2186 rc = listening_get_next(seq, NULL);
1da177e4
LT
2187
2188 while (rc && *pos) {
2189 rc = listening_get_next(seq, rc);
2190 --*pos;
2191 }
2192 return rc;
2193}
2194
05dbc7b5 2195static inline bool empty_bucket(const struct tcp_iter_state *st)
6eac5604 2196{
05dbc7b5 2197 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
6eac5604
AK
2198}
2199
a8b690f9
TH
2200/*
2201 * Get the first established socket, starting from the bucket given in st->bucket.
2202 * If st->bucket is zero, the very first socket in the hash is returned.
2203 */
1da177e4
LT
2204static void *established_get_first(struct seq_file *seq)
2205{
37d849bb 2206 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
5799de0b 2207 struct tcp_iter_state *st = seq->private;
a4146b1b 2208 struct net *net = seq_file_net(seq);
1da177e4
LT
2209 void *rc = NULL;
2210
a8b690f9
TH
2211 st->offset = 0;
2212 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 2213 struct sock *sk;
3ab5aee7 2214 struct hlist_nulls_node *node;
9db66bdc 2215 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 2216
6eac5604
AK
2217 /* Lockless fast path for the common case of empty buckets */
2218 if (empty_bucket(st))
2219 continue;
2220
9db66bdc 2221 spin_lock_bh(lock);
3ab5aee7 2222 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
37d849bb 2223 if (sk->sk_family != afinfo->family ||
878628fb 2224 !net_eq(sock_net(sk), net)) {
1da177e4
LT
2225 continue;
2226 }
2227 rc = sk;
2228 goto out;
2229 }
9db66bdc 2230 spin_unlock_bh(lock);
1da177e4
LT
2231 }
2232out:
2233 return rc;
2234}
2235
2236static void *established_get_next(struct seq_file *seq, void *cur)
2237{
37d849bb 2238 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
1da177e4 2239 struct sock *sk = cur;
3ab5aee7 2240 struct hlist_nulls_node *node;
5799de0b 2241 struct tcp_iter_state *st = seq->private;
a4146b1b 2242 struct net *net = seq_file_net(seq);
1da177e4
LT
2243
2244 ++st->num;
a8b690f9 2245 ++st->offset;
1da177e4 2246
05dbc7b5 2247 sk = sk_nulls_next(sk);
1da177e4 2248
3ab5aee7 2249 sk_nulls_for_each_from(sk, node) {
37d849bb
CH
2250 if (sk->sk_family == afinfo->family &&
2251 net_eq(sock_net(sk), net))
05dbc7b5 2252 return sk;
1da177e4
LT
2253 }
2254
05dbc7b5
ED
2255 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2256 ++st->bucket;
2257 return established_get_first(seq);
1da177e4
LT
2258}
2259
2260static void *established_get_idx(struct seq_file *seq, loff_t pos)
2261{
a8b690f9
TH
2262 struct tcp_iter_state *st = seq->private;
2263 void *rc;
2264
2265 st->bucket = 0;
2266 rc = established_get_first(seq);
1da177e4
LT
2267
2268 while (rc && pos) {
2269 rc = established_get_next(seq, rc);
2270 --pos;
7174259e 2271 }
1da177e4
LT
2272 return rc;
2273}
2274
2275static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2276{
2277 void *rc;
5799de0b 2278 struct tcp_iter_state *st = seq->private;
1da177e4 2279
1da177e4
LT
2280 st->state = TCP_SEQ_STATE_LISTENING;
2281 rc = listening_get_idx(seq, &pos);
2282
2283 if (!rc) {
1da177e4
LT
2284 st->state = TCP_SEQ_STATE_ESTABLISHED;
2285 rc = established_get_idx(seq, pos);
2286 }
2287
2288 return rc;
2289}
2290
a8b690f9
TH
2291static void *tcp_seek_last_pos(struct seq_file *seq)
2292{
2293 struct tcp_iter_state *st = seq->private;
2294 int offset = st->offset;
2295 int orig_num = st->num;
2296 void *rc = NULL;
2297
2298 switch (st->state) {
a8b690f9
TH
2299 case TCP_SEQ_STATE_LISTENING:
2300 if (st->bucket >= INET_LHTABLE_SIZE)
2301 break;
2302 st->state = TCP_SEQ_STATE_LISTENING;
2303 rc = listening_get_next(seq, NULL);
2304 while (offset-- && rc)
2305 rc = listening_get_next(seq, rc);
2306 if (rc)
2307 break;
2308 st->bucket = 0;
05dbc7b5 2309 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2310 /* Fallthrough */
2311 case TCP_SEQ_STATE_ESTABLISHED:
a8b690f9
TH
2312 if (st->bucket > tcp_hashinfo.ehash_mask)
2313 break;
2314 rc = established_get_first(seq);
2315 while (offset-- && rc)
2316 rc = established_get_next(seq, rc);
2317 }
2318
2319 st->num = orig_num;
2320
2321 return rc;
2322}
2323
37d849bb 2324void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1da177e4 2325{
5799de0b 2326 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2327 void *rc;
2328
2329 if (*pos && *pos == st->last_pos) {
2330 rc = tcp_seek_last_pos(seq);
2331 if (rc)
2332 goto out;
2333 }
2334
1da177e4
LT
2335 st->state = TCP_SEQ_STATE_LISTENING;
2336 st->num = 0;
a8b690f9
TH
2337 st->bucket = 0;
2338 st->offset = 0;
2339 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2340
2341out:
2342 st->last_pos = *pos;
2343 return rc;
1da177e4 2344}
37d849bb 2345EXPORT_SYMBOL(tcp_seq_start);
1da177e4 2346
37d849bb 2347void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1da177e4 2348{
a8b690f9 2349 struct tcp_iter_state *st = seq->private;
1da177e4 2350 void *rc = NULL;
1da177e4
LT
2351
2352 if (v == SEQ_START_TOKEN) {
2353 rc = tcp_get_idx(seq, 0);
2354 goto out;
2355 }
1da177e4
LT
2356
2357 switch (st->state) {
1da177e4
LT
2358 case TCP_SEQ_STATE_LISTENING:
2359 rc = listening_get_next(seq, v);
2360 if (!rc) {
1da177e4 2361 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2362 st->bucket = 0;
2363 st->offset = 0;
1da177e4
LT
2364 rc = established_get_first(seq);
2365 }
2366 break;
2367 case TCP_SEQ_STATE_ESTABLISHED:
1da177e4
LT
2368 rc = established_get_next(seq, v);
2369 break;
2370 }
2371out:
2372 ++*pos;
a8b690f9 2373 st->last_pos = *pos;
1da177e4
LT
2374 return rc;
2375}
37d849bb 2376EXPORT_SYMBOL(tcp_seq_next);
1da177e4 2377
37d849bb 2378void tcp_seq_stop(struct seq_file *seq, void *v)
1da177e4 2379{
5799de0b 2380 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2381
2382 switch (st->state) {
1da177e4
LT
2383 case TCP_SEQ_STATE_LISTENING:
2384 if (v != SEQ_START_TOKEN)
9652dc2e 2385 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4 2386 break;
1da177e4
LT
2387 case TCP_SEQ_STATE_ESTABLISHED:
2388 if (v)
9db66bdc 2389 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2390 break;
2391 }
2392}
37d849bb 2393EXPORT_SYMBOL(tcp_seq_stop);
1da177e4 2394
d4f06873 2395static void get_openreq4(const struct request_sock *req,
aa3a0c8c 2396 struct seq_file *f, int i)
1da177e4 2397{
2e6599cb 2398 const struct inet_request_sock *ireq = inet_rsk(req);
fa76ce73 2399 long delta = req->rsk_timer.expires - jiffies;
1da177e4 2400
5e659e4c 2401 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2402 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2403 i,
634fb979 2404 ireq->ir_loc_addr,
d4f06873 2405 ireq->ir_num,
634fb979
ED
2406 ireq->ir_rmt_addr,
2407 ntohs(ireq->ir_rmt_port),
1da177e4
LT
2408 TCP_SYN_RECV,
2409 0, 0, /* could print option size, but that is af dependent. */
2410 1, /* timers active (only the expire timer) */
a399a805 2411 jiffies_delta_to_clock_t(delta),
e6c022a4 2412 req->num_timeout,
aa3a0c8c
ED
2413 from_kuid_munged(seq_user_ns(f),
2414 sock_i_uid(req->rsk_listener)),
1da177e4
LT
2415 0, /* non standard timer */
2416 0, /* open_requests have no inode */
d4f06873 2417 0,
652586df 2418 req);
1da177e4
LT
2419}
2420
652586df 2421static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
1da177e4
LT
2422{
2423 int timer_active;
2424 unsigned long timer_expires;
cf533ea5 2425 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2426 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2427 const struct inet_sock *inet = inet_sk(sk);
0536fcc0 2428 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2429 __be32 dest = inet->inet_daddr;
2430 __be32 src = inet->inet_rcv_saddr;
2431 __u16 destp = ntohs(inet->inet_dport);
2432 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2433 int rx_queue;
00fd38d9 2434 int state;
1da177e4 2435
6ba8a3b1 2436 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
57dde7f7 2437 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
6ba8a3b1 2438 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2439 timer_active = 1;
463c84b9
ACM
2440 timer_expires = icsk->icsk_timeout;
2441 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2442 timer_active = 4;
463c84b9 2443 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2444 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2445 timer_active = 2;
cf4c6bf8 2446 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2447 } else {
2448 timer_active = 0;
2449 timer_expires = jiffies;
2450 }
2451
986ffdfd 2452 state = inet_sk_state_load(sk);
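	/* sk_ack_backlog is updated without the socket lock held, so this
	 * lockless reader pairs READ_ONCE() with the WRITE_ONCE() done at
	 * the writers to avoid load/store tearing.
	 */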
00fd38d9 2453 if (state == TCP_LISTEN)
288efe86 2454 rx_queue = READ_ONCE(sk->sk_ack_backlog);
49d09007 2455 else
00fd38d9
ED
2456 /* Because we don't lock the socket,
2457 * we might find a transient negative value.
49d09007 2458 */
dba7d9b8 2459 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
7db48e98 2460 READ_ONCE(tp->copied_seq), 0);
49d09007 2461
5e659e4c 2462 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2463 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
00fd38d9 2464 i, src, srcp, dest, destp, state,
0f317464 2465 READ_ONCE(tp->write_seq) - tp->snd_una,
49d09007 2466 rx_queue,
1da177e4 2467 timer_active,
a399a805 2468 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2469 icsk->icsk_retransmits,
a7cb5a49 2470 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2471 icsk->icsk_probes_out,
cf4c6bf8 2472 sock_i_ino(sk),
41c6d650 2473 refcount_read(&sk->sk_refcnt), sk,
7be87351
SH
2474 jiffies_to_clock_t(icsk->icsk_rto),
2475 jiffies_to_clock_t(icsk->icsk_ack.ato),
31954cd8 2476 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
1da177e4 2477 tp->snd_cwnd,
00fd38d9
ED
2478 state == TCP_LISTEN ?
2479 fastopenq->max_qlen :
652586df 2480 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
1da177e4
LT
2481}
2482
cf533ea5 2483static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2484 struct seq_file *f, int i)
1da177e4 2485{
789f558c 2486 long delta = tw->tw_timer.expires - jiffies;
23f33c2d 2487 __be32 dest, src;
1da177e4 2488 __u16 destp, srcp;
1da177e4
LT
2489
2490 dest = tw->tw_daddr;
2491 src = tw->tw_rcv_saddr;
2492 destp = ntohs(tw->tw_dport);
2493 srcp = ntohs(tw->tw_sport);
2494
5e659e4c 2495 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2496 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2497 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2498 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
41c6d650 2499 refcount_read(&tw->tw_refcnt), tw);
1da177e4
LT
2500}
2501
2502#define TMPSZ 150
2503
2504static int tcp4_seq_show(struct seq_file *seq, void *v)
2505{
5799de0b 2506 struct tcp_iter_state *st;
05dbc7b5 2507 struct sock *sk = v;
1da177e4 2508
652586df 2509 seq_setwidth(seq, TMPSZ - 1);
1da177e4 2510 if (v == SEQ_START_TOKEN) {
652586df 2511 seq_puts(seq, " sl local_address rem_address st tx_queue "
1da177e4
LT
2512 "rx_queue tr tm->when retrnsmt uid timeout "
2513 "inode");
2514 goto out;
2515 }
2516 st = seq->private;
2517
079096f1
ED
2518 if (sk->sk_state == TCP_TIME_WAIT)
2519 get_timewait4_sock(v, seq, st->num);
2520 else if (sk->sk_state == TCP_NEW_SYN_RECV)
aa3a0c8c 2521 get_openreq4(v, seq, st->num);
079096f1
ED
2522 else
2523 get_tcp4_sock(v, seq, st->num);
1da177e4 2524out:
652586df 2525 seq_pad(seq, '\n');
1da177e4
LT
2526 return 0;
2527}
2528
37d849bb
CH
2529static const struct seq_operations tcp4_seq_ops = {
2530 .show = tcp4_seq_show,
2531 .start = tcp_seq_start,
2532 .next = tcp_seq_next,
2533 .stop = tcp_seq_stop,
2534};
2535
1da177e4 2536static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4 2537 .family = AF_INET,
1da177e4
LT
2538};
2539
2c8c1e72 2540static int __net_init tcp4_proc_init_net(struct net *net)
757764f6 2541{
c3506372
CH
2542 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2543 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
37d849bb
CH
2544 return -ENOMEM;
2545 return 0;
757764f6
PE
2546}
2547
2c8c1e72 2548static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6 2549{
37d849bb 2550 remove_proc_entry("tcp", net->proc_net);
757764f6
PE
2551}
2552
2553static struct pernet_operations tcp4_net_ops = {
2554 .init = tcp4_proc_init_net,
2555 .exit = tcp4_proc_exit_net,
2556};
2557
1da177e4
LT
2558int __init tcp4_proc_init(void)
2559{
757764f6 2560 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2561}
2562
2563void tcp4_proc_exit(void)
2564{
757764f6 2565 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2566}
2567#endif /* CONFIG_PROC_FS */
2568
2569struct proto tcp_prot = {
2570 .name = "TCP",
2571 .owner = THIS_MODULE,
2572 .close = tcp_close,
d74bad4e 2573 .pre_connect = tcp_v4_pre_connect,
1da177e4
LT
2574 .connect = tcp_v4_connect,
2575 .disconnect = tcp_disconnect,
463c84b9 2576 .accept = inet_csk_accept,
1da177e4
LT
2577 .ioctl = tcp_ioctl,
2578 .init = tcp_v4_init_sock,
2579 .destroy = tcp_v4_destroy_sock,
2580 .shutdown = tcp_shutdown,
2581 .setsockopt = tcp_setsockopt,
2582 .getsockopt = tcp_getsockopt,
4b9d07a4 2583 .keepalive = tcp_set_keepalive,
1da177e4 2584 .recvmsg = tcp_recvmsg,
7ba42910
CG
2585 .sendmsg = tcp_sendmsg,
2586 .sendpage = tcp_sendpage,
1da177e4 2587 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 2588 .release_cb = tcp_release_cb,
ab1e0a13
ACM
2589 .hash = inet_hash,
2590 .unhash = inet_unhash,
2591 .get_port = inet_csk_get_port,
1da177e4 2592 .enter_memory_pressure = tcp_enter_memory_pressure,
06044751 2593 .leave_memory_pressure = tcp_leave_memory_pressure,
c9bee3b7 2594 .stream_memory_free = tcp_stream_memory_free,
1da177e4 2595 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 2596 .orphan_count = &tcp_orphan_count,
1da177e4
LT
2597 .memory_allocated = &tcp_memory_allocated,
2598 .memory_pressure = &tcp_memory_pressure,
a4fe34bf 2599 .sysctl_mem = sysctl_tcp_mem,
356d1833
ED
2600 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2601 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
1da177e4
LT
2602 .max_header = MAX_TCP_HEADER,
2603 .obj_size = sizeof(struct tcp_sock),
5f0d5a3a 2604 .slab_flags = SLAB_TYPESAFE_BY_RCU,
6d6ee43e 2605 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 2606 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 2607 .h.hashinfo = &tcp_hashinfo,
7ba42910 2608 .no_autobind = true,
543d9cfe
ACM
2609#ifdef CONFIG_COMPAT
2610 .compat_setsockopt = compat_tcp_setsockopt,
2611 .compat_getsockopt = compat_tcp_getsockopt,
d1a4c0b3 2612#endif
c1e64e29 2613 .diag_destroy = tcp_abort,
1da177e4 2614};
4bc2f18b 2615EXPORT_SYMBOL(tcp_prot);
1da177e4 2616
bdbbb852
ED
2617static void __net_exit tcp_sk_exit(struct net *net)
2618{
2619 int cpu;
2620
b506bc97
DL
2621 if (net->ipv4.tcp_congestion_control)
2622 module_put(net->ipv4.tcp_congestion_control->owner);
6670e152 2623
bdbbb852
ED
2624 for_each_possible_cpu(cpu)
2625 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2626 free_percpu(net->ipv4.tcp_sk);
2627}
2628
046ee902
DL
2629static int __net_init tcp_sk_init(struct net *net)
2630{
fee83d09 2631 int res, cpu, cnt;
bdbbb852
ED
2632
2633 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2634 if (!net->ipv4.tcp_sk)
2635 return -ENOMEM;
2636
2637 for_each_possible_cpu(cpu) {
2638 struct sock *sk;
2639
2640 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2641 IPPROTO_TCP, net);
2642 if (res)
2643 goto fail;
a9d6532b 2644 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
431280ee
ED
2645
2646 /* Please enforce IP_DF and IPID==0 for RST and
2647 * ACK sent in SYN-RECV and TIME-WAIT state.
2648 */
2649 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2650
bdbbb852
ED
2651 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2652 }
49213555 2653
5d134f1c 2654 net->ipv4.sysctl_tcp_ecn = 2;
49213555
DB
2655 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2656
b0f9ca53 2657 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
5f3e2bf0 2658 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
6b58e0a5 2659 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
05cbc0db 2660 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
c04b79b6 2661 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
046ee902 2662
13b287e8 2663 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
9bd6861b 2664 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
b840d15d 2665 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
13b287e8 2666
6fa25166 2667 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
7c083ecb 2668 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
0aca737d 2669 net->ipv4.sysctl_tcp_syncookies = 1;
1043e25f 2670 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
ae5c3f40 2671 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
c6214a97 2672 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
c402d9be 2673 net->ipv4.sysctl_tcp_orphan_retries = 0;
1e579caa 2674 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
4979f2d9 2675 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
79e9fed4 2676 net->ipv4.sysctl_tcp_tw_reuse = 2;
12ed8244 2677
fee83d09 2678 cnt = tcp_hashinfo.ehash_mask + 1;
743e4815 2679 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
1946e672
HY
2680 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2681
623d0c2d 2682 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
f9301034 2683 net->ipv4.sysctl_tcp_sack = 1;
9bb37ef0 2684 net->ipv4.sysctl_tcp_window_scaling = 1;
5d2ed052 2685 net->ipv4.sysctl_tcp_timestamps = 1;
2ae21cf5 2686 net->ipv4.sysctl_tcp_early_retrans = 3;
e20223f1 2687 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
b510f0d2 2688 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
e0a1e5b5 2689 net->ipv4.sysctl_tcp_retrans_collapse = 1;
c6e21803 2690 net->ipv4.sysctl_tcp_max_reordering = 300;
6496f6bd 2691 net->ipv4.sysctl_tcp_dsack = 1;
0c12654a 2692 net->ipv4.sysctl_tcp_app_win = 31;
94f0893e 2693 net->ipv4.sysctl_tcp_adv_win_scale = 1;
af9b69a7 2694 net->ipv4.sysctl_tcp_frto = 2;
4540c0cf 2695 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
d06a9904
ED
2696 /* This limits the percentage of the congestion window which we
2697 * will allow a single TSO frame to consume. Building TSO frames
2698 * which are too large can cause TCP streams to be bursty.
2699 */
2700 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
c73e5807
ED
2701 /* Default TSQ limit of 16 TSO segments */
2702 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
b530b681
ED
2703 /* rfc5961 challenge ack rate limiting */
2704 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
26e9596e 2705 net->ipv4.sysctl_tcp_min_tso_segs = 2;
bd239704 2706 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
790f00e1 2707 net->ipv4.sysctl_tcp_autocorking = 1;
4170ba6b 2708 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
23a7102a 2709 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
c26e91f8 2710 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
356d1833
ED
2711 if (net != &init_net) {
2712 memcpy(net->ipv4.sysctl_tcp_rmem,
2713 init_net.ipv4.sysctl_tcp_rmem,
2714 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2715 memcpy(net->ipv4.sysctl_tcp_wmem,
2716 init_net.ipv4.sysctl_tcp_wmem,
2717 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2718 }
6d82aa24 2719 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
9c21d2fc 2720 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
e1cfcbe8 2721 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
43713848 2722 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
3733be14
HY
2723 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2724 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
e1cfcbe8 2725
6670e152
SH
2726 /* Reno is always built in */
2727 if (!net_eq(net, &init_net) &&
2728 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2729 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2730 else
2731 net->ipv4.tcp_congestion_control = &tcp_reno;
2732
49213555 2733 return 0;
bdbbb852
ED
2734fail:
2735 tcp_sk_exit(net);
2736
2737 return res;
b099ce26
EB
2738}
2739
2740static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2741{
43713848
HY
2742 struct net *net;
2743
1946e672 2744 inet_twsk_purge(&tcp_hashinfo, AF_INET);
43713848
HY
2745
2746 list_for_each_entry(net, net_exit_list, exit_list)
2747 tcp_fastopen_ctx_destroy(net);
046ee902
DL
2748}
2749
2750static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
2751 .init = tcp_sk_init,
2752 .exit = tcp_sk_exit,
2753 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
2754};
2755
9b0f976f 2756void __init tcp_v4_init(void)
1da177e4 2757{
6a1b3054 2758 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 2759 panic("Failed to create the TCP control socket.\n");
1da177e4 2760}