net/ipv4/tcp_ipv4.c
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
60236fdd 37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
caa20d9a 40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
afd46503 53#define pr_fmt(fmt) "TCP: " fmt
1da177e4 54
eb4dea58 55#include <linux/bottom_half.h>
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
5a0e3ad6 64#include <linux/slab.h>
1da177e4 65
457c4cbc 66#include <net/net_namespace.h>
1da177e4 67#include <net/icmp.h>
304a1618 68#include <net/inet_hashtables.h>
1da177e4 69#include <net/tcp.h>
20380731 70#include <net/transp_v6.h>
71#include <net/ipv6.h>
72#include <net/inet_common.h>
6d6ee43e 73#include <net/timewait_sock.h>
1da177e4 74#include <net/xfrm.h>
6e5714ea 75#include <net/secure_seq.h>
076bb0c8 76#include <net/busy_poll.h>
77
78#include <linux/inet.h>
79#include <linux/ipv6.h>
80#include <linux/stddef.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
6797318e 83#include <linux/inetdevice.h>
1da177e4 84
cf80e0e4 85#include <crypto/hash.h>
86#include <linux/scatterlist.h>
87
88#include <trace/events/tcp.h>
89
cfb6eeb4 90#ifdef CONFIG_TCP_MD5SIG
a915da9b 91static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 92 __be32 daddr, __be32 saddr, const struct tcphdr *th);
93#endif
94
5caea4ea 95struct inet_hashinfo tcp_hashinfo;
4bc2f18b 96EXPORT_SYMBOL(tcp_hashinfo);
1da177e4 97
84b114b9 98static u32 tcp_v4_init_seq(const struct sk_buff *skb)
1da177e4 99{
100 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 ip_hdr(skb)->saddr,
102 tcp_hdr(skb)->dest,
103 tcp_hdr(skb)->source);
104}
105
5d2ed052 106static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
84b114b9 107{
5d2ed052 108 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109}
110
111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112{
79e9fed4 113 const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 struct tcp_sock *tp = tcp_sk(sk);
116 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117
118 if (reuse == 2) {
119 /* Still does not detect *everything* that goes through
120 * lo, since we require a loopback src or dst address
121 * or direct binding to 'lo' interface.
122 */
123 bool loopback = false;
124 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125 loopback = true;
126#if IS_ENABLED(CONFIG_IPV6)
127 if (tw->tw_family == AF_INET6) {
128 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130 (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134 loopback = true;
135 } else
136#endif
137 {
138 if (ipv4_is_loopback(tw->tw_daddr) ||
139 ipv4_is_loopback(tw->tw_rcv_saddr))
140 loopback = true;
141 }
142 if (!loopback)
143 reuse = 0;
144 }
145
146 /* With PAWS, it is safe from the viewpoint
147 of data integrity. Even without PAWS it is safe provided sequence
148 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
149
150 Actually, the idea is close to VJ's one, only timestamp cache is
151 held not per host, but per port pair and TW bucket is used as state
152 holder.
153
154 If TW bucket has been already destroyed we fall back to VJ's scheme
155 and use initial timestamp retrieved from peer table.
156 */
157 if (tcptw->tw_ts_recent_stamp &&
158 (!twp || (reuse && time_after32(ktime_get_seconds(),
159 tcptw->tw_ts_recent_stamp)))) {
160 /* In case of repair and re-using TIME-WAIT sockets we still
161 * want to be sure that it is safe as above but honor the
162 * sequence numbers and time stamps set as part of the repair
163 * process.
164 *
165 * Without this check re-using a TIME-WAIT socket with TCP
166 * repair would accumulate a -1 on the repair assigned
167 * sequence number. The first time it is reused the sequence
168 * is -1, the second time -2, etc. This fixes that issue
169 * without appearing to create any others.
170 */
171 if (likely(!tp->repair)) {
172 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
173 if (tp->write_seq == 0)
174 tp->write_seq = 1;
175 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
176 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
177 }
178 sock_hold(sktw);
179 return 1;
180 }
181
182 return 0;
183}
184EXPORT_SYMBOL_GPL(tcp_twsk_unique);
185
186static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
187 int addr_len)
188{
189 /* This check is replicated from tcp_v4_connect() and intended to
190 * prevent BPF program called below from accessing bytes that are out
191 * of the bound specified by user in addr_len.
192 */
193 if (addr_len < sizeof(struct sockaddr_in))
194 return -EINVAL;
195
196 sock_owned_by_me(sk);
197
198 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199}
200
201/* This will initiate an outgoing connection. */
202int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
203{
2d7192d6 204 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
205 struct inet_sock *inet = inet_sk(sk);
206 struct tcp_sock *tp = tcp_sk(sk);
dca8b089 207 __be16 orig_sport, orig_dport;
bada8adc 208 __be32 daddr, nexthop;
da905bd1 209 struct flowi4 *fl4;
2d7192d6 210 struct rtable *rt;
1da177e4 211 int err;
f6d8bd05 212 struct ip_options_rcu *inet_opt;
1946e672 213 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
214
215 if (addr_len < sizeof(struct sockaddr_in))
216 return -EINVAL;
217
218 if (usin->sin_family != AF_INET)
219 return -EAFNOSUPPORT;
220
221 nexthop = daddr = usin->sin_addr.s_addr;
f6d8bd05 222 inet_opt = rcu_dereference_protected(inet->inet_opt,
1e1d04e6 223 lockdep_sock_is_held(sk));
f6d8bd05 224 if (inet_opt && inet_opt->opt.srr) {
225 if (!daddr)
226 return -EINVAL;
f6d8bd05 227 nexthop = inet_opt->opt.faddr;
228 }
229
230 orig_sport = inet->inet_sport;
231 orig_dport = usin->sin_port;
232 fl4 = &inet->cork.fl.u.ip4;
233 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
235 IPPROTO_TCP,
0e0d44ab 236 orig_sport, orig_dport, sk);
237 if (IS_ERR(rt)) {
238 err = PTR_ERR(rt);
239 if (err == -ENETUNREACH)
f1d8cba6 240 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
b23dd4fe 241 return err;
584bdf8c 242 }
243
244 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
245 ip_rt_put(rt);
246 return -ENETUNREACH;
247 }
248
f6d8bd05 249 if (!inet_opt || !inet_opt->opt.srr)
da905bd1 250 daddr = fl4->daddr;
1da177e4 251
c720c7e8 252 if (!inet->inet_saddr)
da905bd1 253 inet->inet_saddr = fl4->saddr;
d1e559d0 254 sk_rcv_saddr_set(sk, inet->inet_saddr);
1da177e4 255
c720c7e8 256 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
257 /* Reset inherited state */
258 tp->rx_opt.ts_recent = 0;
259 tp->rx_opt.ts_recent_stamp = 0;
260 if (likely(!tp->repair))
261 tp->write_seq = 0;
262 }
263
c720c7e8 264 inet->inet_dport = usin->sin_port;
d1e559d0 265 sk_daddr_set(sk, daddr);
1da177e4 266
d83d8461 267 inet_csk(sk)->icsk_ext_hdr_len = 0;
268 if (inet_opt)
269 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1da177e4 270
bee7ca9e 271 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
272
273 /* Socket identity is still unknown (sport may be zero).
 274 * However we set state to SYN-SENT and, without releasing the socket
 275 * lock, select a source port, enter ourselves into the hash tables and
276 * complete initialization after this.
277 */
278 tcp_set_state(sk, TCP_SYN_SENT);
1946e672 279 err = inet_hash_connect(tcp_death_row, sk);
280 if (err)
281 goto failure;
282
877d1f62 283 sk_set_txhash(sk);
9e7ceb06 284
da905bd1 285 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
286 inet->inet_sport, inet->inet_dport, sk);
287 if (IS_ERR(rt)) {
288 err = PTR_ERR(rt);
289 rt = NULL;
1da177e4 290 goto failure;
b23dd4fe 291 }
1da177e4 292 /* OK, now commit destination to socket. */
bcd76111 293 sk->sk_gso_type = SKB_GSO_TCPV4;
d8d1f30b 294 sk_setup_caps(sk, &rt->dst);
19f6d3f3 295 rt = NULL;
1da177e4 296
00355fa5 297 if (likely(!tp->repair)) {
00355fa5 298 if (!tp->write_seq)
299 tp->write_seq = secure_tcp_seq(inet->inet_saddr,
300 inet->inet_daddr,
301 inet->inet_sport,
302 usin->sin_port);
303 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
304 inet->inet_saddr,
84b114b9 305 inet->inet_daddr);
00355fa5 306 }
1da177e4 307
c720c7e8 308 inet->inet_id = tp->write_seq ^ jiffies;
1da177e4 309
310 if (tcp_fastopen_defer_connect(sk, &err))
311 return err;
312 if (err)
313 goto failure;
314
2b916477 315 err = tcp_connect(sk);
ee995283 316
317 if (err)
318 goto failure;
319
320 return 0;
321
322failure:
323 /*
324 * This unhashes the socket and releases the local port,
325 * if necessary.
326 */
327 tcp_set_state(sk, TCP_CLOSE);
328 ip_rt_put(rt);
329 sk->sk_route_caps = 0;
c720c7e8 330 inet->inet_dport = 0;
331 return err;
332}
4bc2f18b 333EXPORT_SYMBOL(tcp_v4_connect);
1da177e4 334
1da177e4 335/*
336 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
337 * It can be called through tcp_release_cb() if socket was owned by user
338 * at the time tcp_v4_err() was called to handle ICMP message.
1da177e4 339 */
4fab9071 340void tcp_v4_mtu_reduced(struct sock *sk)
1da177e4 341{
1da177e4 342 struct inet_sock *inet = inet_sk(sk);
343 struct dst_entry *dst;
344 u32 mtu;
1da177e4 345
346 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
347 return;
348 mtu = tcp_sk(sk)->mtu_info;
349 dst = inet_csk_update_pmtu(sk, mtu);
350 if (!dst)
351 return;
352
353 /* Something is about to be wrong... Remember soft error
 354 * for the case, if this connection will not be able to recover.
355 */
356 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
357 sk->sk_err_soft = EMSGSIZE;
358
359 mtu = dst_mtu(dst);
360
361 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
482fc609 362 ip_sk_accept_pmtu(sk) &&
d83d8461 363 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
364 tcp_sync_mss(sk, mtu);
365
366 /* Resend the TCP packet because it's
367 * clear that the old packet has been
368 * dropped. This is the new "fast" path mtu
369 * discovery.
370 */
371 tcp_simple_retransmit(sk);
372 } /* else let the usual retransmit timer handle it */
373}
4fab9071 374EXPORT_SYMBOL(tcp_v4_mtu_reduced);
1da177e4 375
376static void do_redirect(struct sk_buff *skb, struct sock *sk)
377{
378 struct dst_entry *dst = __sk_dst_check(sk, 0);
379
1ed5c48f 380 if (dst)
6700c270 381 dst->ops->redirect(dst, sk, skb);
382}
383
384
385/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
9cf74903 386void tcp_req_err(struct sock *sk, u32 seq, bool abort)
387{
388 struct request_sock *req = inet_reqsk(sk);
389 struct net *net = sock_net(sk);
390
391 /* ICMPs are not backlogged, hence we cannot get
392 * an established socket here.
393 */
26e37360 394 if (seq != tcp_rsk(req)->snt_isn) {
02a1d6e7 395 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
9cf74903 396 } else if (abort) {
397 /*
398 * Still in SYN_RECV, just remove it silently.
399 * There is no good way to pass the error to the newly
400 * created socket, and POSIX does not want network
401 * errors returned from accept().
402 */
c6973669 403 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
9caad864 404 tcp_listendrop(req->rsk_listener);
26e37360 405 }
ef84d8ce 406 reqsk_put(req);
407}
408EXPORT_SYMBOL(tcp_req_err);
409
410/*
411 * This routine is called by the ICMP module when it gets some
412 * sort of error condition. If err < 0 then the socket should
413 * be closed and the error returned to the user. If err > 0
414 * it's just the icmp type << 8 | icmp code. After adjustment
415 * header points to the first 8 bytes of the tcp header. We need
416 * to find the appropriate port.
417 *
418 * The locking strategy used here is very "optimistic". When
419 * someone else accesses the socket the ICMP is just dropped
420 * and for some paths there is no check at all.
421 * A more general error queue to queue errors for later handling
422 * is probably better.
423 *
424 */
425
32bbd879 426int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
1da177e4 427{
b71d1d42 428 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
4d1a2d9e 429 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
f1ecd5d9 430 struct inet_connection_sock *icsk;
431 struct tcp_sock *tp;
432 struct inet_sock *inet;
433 const int type = icmp_hdr(icmp_skb)->type;
434 const int code = icmp_hdr(icmp_skb)->code;
1da177e4 435 struct sock *sk;
f1ecd5d9 436 struct sk_buff *skb;
0a672f74 437 struct request_sock *fastopen;
438 u32 seq, snd_una;
439 s32 remaining;
440 u32 delta_us;
1da177e4 441 int err;
4d1a2d9e 442 struct net *net = dev_net(icmp_skb->dev);
1da177e4 443
444 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
445 th->dest, iph->saddr, ntohs(th->source),
3fa6f616 446 inet_iif(icmp_skb), 0);
1da177e4 447 if (!sk) {
5d3848bc 448 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
32bbd879 449 return -ENOENT;
450 }
451 if (sk->sk_state == TCP_TIME_WAIT) {
9469c7b4 452 inet_twsk_put(inet_twsk(sk));
32bbd879 453 return 0;
1da177e4 454 }
26e37360 455 seq = ntohl(th->seq);
456 if (sk->sk_state == TCP_NEW_SYN_RECV) {
457 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
458 type == ICMP_TIME_EXCEEDED ||
459 (type == ICMP_DEST_UNREACH &&
460 (code == ICMP_NET_UNREACH ||
461 code == ICMP_HOST_UNREACH)));
462 return 0;
463 }
464
465 bh_lock_sock(sk);
466 /* If too many ICMPs get dropped on busy
467 * servers this needs to be solved differently.
 468 * We do take care of the PMTU discovery (RFC1191) special case:
469 * we can receive locally generated ICMP messages while socket is held.
1da177e4 470 */
471 if (sock_owned_by_user(sk)) {
472 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
02a1d6e7 473 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
b74aa930 474 }
475 if (sk->sk_state == TCP_CLOSE)
476 goto out;
477
97e3ecd1 478 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
02a1d6e7 479 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
97e3ecd1 480 goto out;
481 }
482
f1ecd5d9 483 icsk = inet_csk(sk);
1da177e4 484 tp = tcp_sk(sk);
485 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
486 fastopen = tp->fastopen_rsk;
487 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
1da177e4 488 if (sk->sk_state != TCP_LISTEN &&
0a672f74 489 !between(seq, snd_una, tp->snd_nxt)) {
02a1d6e7 490 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
491 goto out;
492 }
493
494 switch (type) {
55be7a9c 495 case ICMP_REDIRECT:
496 if (!sock_owned_by_user(sk))
497 do_redirect(icmp_skb, sk);
55be7a9c 498 goto out;
499 case ICMP_SOURCE_QUENCH:
500 /* Just silently ignore these. */
501 goto out;
502 case ICMP_PARAMETERPROB:
503 err = EPROTO;
504 break;
505 case ICMP_DEST_UNREACH:
506 if (code > NR_ICMP_UNREACH)
507 goto out;
508
509 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
510 /* We are not interested in TCP_LISTEN and open_requests
 511 * (SYN-ACKs sent out by Linux are always <576 bytes so
512 * they should go through unfragmented).
513 */
514 if (sk->sk_state == TCP_LISTEN)
515 goto out;
516
563d34d0 517 tp->mtu_info = info;
144d56e9 518 if (!sock_owned_by_user(sk)) {
563d34d0 519 tcp_v4_mtu_reduced(sk);
144d56e9 520 } else {
7aa5470c 521 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
522 sock_hold(sk);
523 }
524 goto out;
525 }
526
527 err = icmp_err_convert[code].errno;
528 /* check if icmp_skb allows revert of backoff
529 * (see draft-zimmermann-tcp-lcd) */
530 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
531 break;
532 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
0a672f74 533 !icsk->icsk_backoff || fastopen)
534 break;
535
536 if (sock_owned_by_user(sk))
537 break;
538
539 skb = tcp_rtx_queue_head(sk);
540 if (WARN_ON_ONCE(!skb))
541 break;
542
f1ecd5d9 543 icsk->icsk_backoff--;
544 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
545 TCP_TIMEOUT_INIT;
546 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
f1ecd5d9 547
f1ecd5d9 548
9a568de4 549 tcp_mstamp_refresh(tp);
2fd66ffb 550 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
7faee5c0 551 remaining = icsk->icsk_rto -
9a568de4 552 usecs_to_jiffies(delta_us);
f1ecd5d9 553
9a568de4 554 if (remaining > 0) {
555 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
556 remaining, TCP_RTO_MAX);
557 } else {
558 /* RTO revert clocked out retransmission.
559 * Will retransmit now */
560 tcp_retransmit_timer(sk);
561 }
562
563 break;
564 case ICMP_TIME_EXCEEDED:
565 err = EHOSTUNREACH;
566 break;
567 default:
568 goto out;
569 }
570
571 switch (sk->sk_state) {
1da177e4 572 case TCP_SYN_SENT:
573 case TCP_SYN_RECV:
 574 /* Only in fast or simultaneous open. If a fast open socket
 575 * is already accepted it is treated as a connected one below.
576 */
51456b29 577 if (fastopen && !fastopen->sk)
578 break;
579
1da177e4 580 if (!sock_owned_by_user(sk)) {
581 sk->sk_err = err;
582
583 sk->sk_error_report(sk);
584
585 tcp_done(sk);
586 } else {
587 sk->sk_err_soft = err;
588 }
589 goto out;
590 }
591
592 /* If we've already connected we will keep trying
593 * until we time out, or the user gives up.
594 *
 595 * rfc1122 4.2.3.9 allows us to consider as hard errors
596 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
597 * but it is obsoleted by pmtu discovery).
598 *
599 * Note, that in modern internet, where routing is unreliable
600 * and in each dark corner broken firewalls sit, sending random
 601 * errors ordered by their masters, even these two messages finally lose
602 * their original sense (even Linux sends invalid PORT_UNREACHs)
603 *
604 * Now we are in compliance with RFCs.
605 * --ANK (980905)
606 */
607
608 inet = inet_sk(sk);
609 if (!sock_owned_by_user(sk) && inet->recverr) {
610 sk->sk_err = err;
611 sk->sk_error_report(sk);
612 } else { /* Only an error on timeout */
613 sk->sk_err_soft = err;
614 }
615
616out:
617 bh_unlock_sock(sk);
618 sock_put(sk);
32bbd879 619 return 0;
620}
621
28850dc7 622void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1da177e4 623{
aa8223c7 624 struct tcphdr *th = tcp_hdr(skb);
1da177e4 625
626 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
627 skb->csum_start = skb_transport_header(skb) - skb->head;
628 skb->csum_offset = offsetof(struct tcphdr, check);
629}
630
419f9f89 631/* This routine computes an IPv4 TCP checksum. */
bb296246 632void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
419f9f89 633{
cf533ea5 634 const struct inet_sock *inet = inet_sk(sk);
635
636 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
637}
4bc2f18b 638EXPORT_SYMBOL(tcp_v4_send_check);
419f9f89 639
640/*
641 * This routine will send an RST to the other tcp.
642 *
643 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
644 * for reset.
645 * Answer: if a packet caused RST, it is not for a socket
646 * existing in our system, if it is matched to a socket,
647 * it is just duplicate segment or bug in other side's TCP.
 648 * So we build the reply based only on parameters
 649 * that arrived with the segment.
650 * Exception: precedence violation. We do not implement it in any case.
651 */
652
a00e7444 653static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
1da177e4 654{
cf533ea5 655 const struct tcphdr *th = tcp_hdr(skb);
656 struct {
657 struct tcphdr th;
658#ifdef CONFIG_TCP_MD5SIG
714e85be 659 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
660#endif
661 } rep;
1da177e4 662 struct ip_reply_arg arg;
cfb6eeb4 663#ifdef CONFIG_TCP_MD5SIG
e46787f0 664 struct tcp_md5sig_key *key = NULL;
665 const __u8 *hash_location = NULL;
666 unsigned char newhash[16];
667 int genhash;
668 struct sock *sk1 = NULL;
cfb6eeb4 669#endif
a86b1e30 670 struct net *net;
00483690 671 struct sock *ctl_sk;
672
673 /* Never send a reset in response to a reset. */
674 if (th->rst)
675 return;
676
677 /* If sk not NULL, it means we did a successful lookup and incoming
678 * route had to be correct. prequeue might have dropped our dst.
679 */
680 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
681 return;
682
683 /* Swap the send and the receive. */
684 memset(&rep, 0, sizeof(rep));
685 rep.th.dest = th->source;
686 rep.th.source = th->dest;
687 rep.th.doff = sizeof(struct tcphdr) / 4;
688 rep.th.rst = 1;
689
690 if (th->ack) {
cfb6eeb4 691 rep.th.seq = th->ack_seq;
1da177e4 692 } else {
693 rep.th.ack = 1;
694 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
695 skb->len - (th->doff << 2));
696 }
697
7174259e 698 memset(&arg, 0, sizeof(arg));
699 arg.iov[0].iov_base = (unsigned char *)&rep;
700 arg.iov[0].iov_len = sizeof(rep.th);
701
0f85feae 702 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
cfb6eeb4 703#ifdef CONFIG_TCP_MD5SIG
3b24d854 704 rcu_read_lock();
658ddaaf 705 hash_location = tcp_parse_md5sig_option(th);
271c3b9b 706 if (sk && sk_fullsock(sk)) {
707 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
708 &ip_hdr(skb)->saddr, AF_INET);
709 } else if (hash_location) {
710 /*
711 * active side is lost. Try to find listening socket through
712 * source port, and then find md5 key through listening socket.
 713 * we are not loosening security here:
 714 * incoming packet is checked with md5 hash using the found key,
715 * no RST generated if md5 hash doesn't match.
716 */
717 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
718 ip_hdr(skb)->saddr,
da5e3630 719 th->source, ip_hdr(skb)->daddr,
720 ntohs(th->source), inet_iif(skb),
721 tcp_v4_sdif(skb));
722 /* don't send rst if it can't find key */
723 if (!sk1)
724 goto out;
725
726 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
727 &ip_hdr(skb)->saddr, AF_INET);
728 if (!key)
729 goto out;
730
658ddaaf 731
39f8e58e 732 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
658ddaaf 733 if (genhash || memcmp(hash_location, newhash, 16) != 0)
734 goto out;
735
736 }
737
738 if (key) {
739 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
740 (TCPOPT_NOP << 16) |
741 (TCPOPT_MD5SIG << 8) |
742 TCPOLEN_MD5SIG);
743 /* Update length and the length the header thinks exists */
744 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
745 rep.th.doff = arg.iov[0].iov_len / 4;
746
49a72dfb 747 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
748 key, ip_hdr(skb)->saddr,
749 ip_hdr(skb)->daddr, &rep.th);
750 }
751#endif
752 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
753 ip_hdr(skb)->saddr, /* XXX */
52cd5750 754 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1da177e4 755 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
756 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
757
e2446eaa 758 /* When socket is gone, all binding information is lost.
759 * routing might fail in this case. No choice here, if we choose to force
760 * input interface, we will misroute in case of asymmetric route.
e2446eaa 761 */
c24b14c4 762 if (sk) {
4c675258 763 arg.bound_dev_if = sk->sk_bound_dev_if;
764 if (sk_fullsock(sk))
765 trace_tcp_send_reset(sk, skb);
c24b14c4 766 }
1da177e4 767
768 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
769 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
770
66b13d99 771 arg.tos = ip_hdr(skb)->tos;
e2d118a1 772 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
47dcc20a 773 local_bh_disable();
774 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
775 if (sk)
776 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
777 inet_twsk(sk)->tw_mark : sk->sk_mark;
778 ip_send_unicast_reply(ctl_sk,
bdbbb852 779 skb, &TCP_SKB_CB(skb)->header.h4.opt,
780 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
781 &arg, arg.iov[0].iov_len);
1da177e4 782
00483690 783 ctl_sk->sk_mark = 0;
784 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
785 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
47dcc20a 786 local_bh_enable();
787
788#ifdef CONFIG_TCP_MD5SIG
789out:
790 rcu_read_unlock();
658ddaaf 791#endif
792}
793
 794/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 795 outside socket context, is certainly ugly. What can I do?
796 */
797
e2d118a1 798static void tcp_v4_send_ack(const struct sock *sk,
e62a123b 799 struct sk_buff *skb, u32 seq, u32 ack,
ee684b6f 800 u32 win, u32 tsval, u32 tsecr, int oif,
88ef4a5a 801 struct tcp_md5sig_key *key,
66b13d99 802 int reply_flags, u8 tos)
1da177e4 803{
cf533ea5 804 const struct tcphdr *th = tcp_hdr(skb);
805 struct {
806 struct tcphdr th;
714e85be 807 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
cfb6eeb4 808#ifdef CONFIG_TCP_MD5SIG
714e85be 809 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
810#endif
811 ];
1da177e4 812 } rep;
e2d118a1 813 struct net *net = sock_net(sk);
1da177e4 814 struct ip_reply_arg arg;
00483690 815 struct sock *ctl_sk;
816
817 memset(&rep.th, 0, sizeof(struct tcphdr));
7174259e 818 memset(&arg, 0, sizeof(arg));
819
820 arg.iov[0].iov_base = (unsigned char *)&rep;
821 arg.iov[0].iov_len = sizeof(rep.th);
ee684b6f 822 if (tsecr) {
823 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
824 (TCPOPT_TIMESTAMP << 8) |
825 TCPOLEN_TIMESTAMP);
826 rep.opt[1] = htonl(tsval);
827 rep.opt[2] = htonl(tsecr);
cb48cfe8 828 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
829 }
830
831 /* Swap the send and the receive. */
832 rep.th.dest = th->source;
833 rep.th.source = th->dest;
834 rep.th.doff = arg.iov[0].iov_len / 4;
835 rep.th.seq = htonl(seq);
836 rep.th.ack_seq = htonl(ack);
837 rep.th.ack = 1;
838 rep.th.window = htons(win);
839
cfb6eeb4 840#ifdef CONFIG_TCP_MD5SIG
cfb6eeb4 841 if (key) {
ee684b6f 842 int offset = (tsecr) ? 3 : 0;
843
844 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
845 (TCPOPT_NOP << 16) |
846 (TCPOPT_MD5SIG << 8) |
847 TCPOLEN_MD5SIG);
848 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
849 rep.th.doff = arg.iov[0].iov_len/4;
850
49a72dfb 851 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
852 key, ip_hdr(skb)->saddr,
853 ip_hdr(skb)->daddr, &rep.th);
854 }
855#endif
88ef4a5a 856 arg.flags = reply_flags;
857 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
858 ip_hdr(skb)->saddr, /* XXX */
859 arg.iov[0].iov_len, IPPROTO_TCP, 0);
860 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
861 if (oif)
862 arg.bound_dev_if = oif;
66b13d99 863 arg.tos = tos;
e2d118a1 864 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
47dcc20a 865 local_bh_disable();
866 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
867 if (sk)
868 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
869 inet_twsk(sk)->tw_mark : sk->sk_mark;
870 ip_send_unicast_reply(ctl_sk,
bdbbb852 871 skb, &TCP_SKB_CB(skb)->header.h4.opt,
872 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
873 &arg, arg.iov[0].iov_len);
1da177e4 874
00483690 875 ctl_sk->sk_mark = 0;
90bbcc60 876 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
47dcc20a 877 local_bh_enable();
878}
879
880static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
881{
8feaf0c0 882 struct inet_timewait_sock *tw = inet_twsk(sk);
cfb6eeb4 883 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1da177e4 884
e2d118a1 885 tcp_v4_send_ack(sk, skb,
e62a123b 886 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
7174259e 887 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
9a568de4 888 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
889 tcptw->tw_ts_recent,
890 tw->tw_bound_dev_if,
88ef4a5a 891 tcp_twsk_md5_key(tcptw),
892 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
893 tw->tw_tos
9501f972 894 );
1da177e4 895
8feaf0c0 896 inet_twsk_put(tw);
897}
898
a00e7444 899static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
7174259e 900 struct request_sock *req)
1da177e4 901{
902 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
903 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
904 */
905 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
906 tcp_sk(sk)->snd_nxt;
907
908 /* RFC 7323 2.3
909 * The window field (SEG.WND) of every outgoing segment, with the
910 * exception of <SYN> segments, MUST be right-shifted by
911 * Rcv.Wind.Shift bits:
912 */
e2d118a1 913 tcp_v4_send_ack(sk, skb, seq,
914 tcp_rsk(req)->rcv_nxt,
915 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
9a568de4 916 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
917 req->ts_recent,
918 0,
30791ac4 919 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
a915da9b 920 AF_INET),
921 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
922 ip_hdr(skb)->tos);
923}
924
1da177e4 925/*
9bf1d83e 926 * Send a SYN-ACK after having received a SYN.
60236fdd 927 * This still operates on a request_sock only, not on a big
928 * socket.
929 */
0f935dbe 930static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
d6274bd8 931 struct flowi *fl,
72659ecc 932 struct request_sock *req,
ca6fb065 933 struct tcp_fastopen_cookie *foc,
b3d05147 934 enum tcp_synack_type synack_type)
1da177e4 935{
2e6599cb 936 const struct inet_request_sock *ireq = inet_rsk(req);
6bd023f3 937 struct flowi4 fl4;
1da177e4 938 int err = -1;
d41db5af 939 struct sk_buff *skb;
940
941 /* First, grab a route. */
ba3f7f04 942 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
fd80eb94 943 return -1;
1da177e4 944
b3d05147 945 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
946
947 if (skb) {
634fb979 948 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1da177e4 949
2ab2ddd3 950 rcu_read_lock();
951 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
952 ireq->ir_rmt_addr,
953 rcu_dereference(ireq->ireq_opt));
954 rcu_read_unlock();
b9df3cb8 955 err = net_xmit_eval(err);
956 }
957
958 return err;
959}
960
961/*
60236fdd 962 * IPv4 request_sock destructor.
1da177e4 963 */
60236fdd 964static void tcp_v4_reqsk_destructor(struct request_sock *req)
1da177e4 965{
c92e8c02 966 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
967}
968
969#ifdef CONFIG_TCP_MD5SIG
970/*
971 * RFC2385 MD5 checksumming requires a mapping of
972 * IP address->MD5 Key.
973 * We need to maintain these in the sk structure.
974 */
975
921f9a0f 976DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
977EXPORT_SYMBOL(tcp_md5_needed);
978
cfb6eeb4 979/* Find the Key structure for an address. */
980struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
981 const union tcp_md5_addr *addr,
982 int family)
cfb6eeb4 983{
fd3a154a 984 const struct tcp_sock *tp = tcp_sk(sk);
a915da9b 985 struct tcp_md5sig_key *key;
fd3a154a 986 const struct tcp_md5sig_info *md5sig;
987 __be32 mask;
988 struct tcp_md5sig_key *best_match = NULL;
989 bool match;
cfb6eeb4 990
991 /* caller either holds rcu_read_lock() or socket lock */
992 md5sig = rcu_dereference_check(tp->md5sig_info,
1e1d04e6 993 lockdep_sock_is_held(sk));
a8afca03 994 if (!md5sig)
cfb6eeb4 995 return NULL;
083a0326 996
b67bfe0d 997 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
998 if (key->family != family)
999 continue;
1000
1001 if (family == AF_INET) {
1002 mask = inet_make_mask(key->prefixlen);
1003 match = (key->addr.a4.s_addr & mask) ==
1004 (addr->a4.s_addr & mask);
1005#if IS_ENABLED(CONFIG_IPV6)
1006 } else if (family == AF_INET6) {
1007 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1008 key->prefixlen);
1009#endif
1010 } else {
1011 match = false;
1012 }
1013
1014 if (match && (!best_match ||
1015 key->prefixlen > best_match->prefixlen))
1016 best_match = key;
1017 }
1018 return best_match;
1019}
6015c71e 1020EXPORT_SYMBOL(__tcp_md5_do_lookup);
6797318e 1021
1022static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1023 const union tcp_md5_addr *addr,
1024 int family, u8 prefixlen)
1025{
1026 const struct tcp_sock *tp = tcp_sk(sk);
1027 struct tcp_md5sig_key *key;
1028 unsigned int size = sizeof(struct in_addr);
1029 const struct tcp_md5sig_info *md5sig;
1030
1031 /* caller either holds rcu_read_lock() or socket lock */
1032 md5sig = rcu_dereference_check(tp->md5sig_info,
1033 lockdep_sock_is_held(sk));
1034 if (!md5sig)
1035 return NULL;
1036#if IS_ENABLED(CONFIG_IPV6)
1037 if (family == AF_INET6)
1038 size = sizeof(struct in6_addr);
1039#endif
1040 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1041 if (key->family != family)
1042 continue;
1043 if (!memcmp(&key->addr, addr, size) &&
1044 key->prefixlen == prefixlen)
a915da9b 1045 return key;
1046 }
1047 return NULL;
1048}
1049
b83e3deb 1050struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
fd3a154a 1051 const struct sock *addr_sk)
cfb6eeb4 1052{
b52e6921 1053 const union tcp_md5_addr *addr;
a915da9b 1054
b52e6921 1055 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
a915da9b 1056 return tcp_md5_do_lookup(sk, addr, AF_INET);
cfb6eeb4 1057}
1058EXPORT_SYMBOL(tcp_v4_md5_lookup);
1059
cfb6eeb4 1060/* This can be called on a newly created socket, from other files */
a915da9b 1061int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1062 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1063 gfp_t gfp)
1064{
1065 /* Add Key to the list */
b0a713e9 1066 struct tcp_md5sig_key *key;
cfb6eeb4 1067 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1068 struct tcp_md5sig_info *md5sig;
cfb6eeb4 1069
6797318e 1070 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1071 if (key) {
1072 /* Pre-existing entry - just update that one. */
a915da9b 1073 memcpy(key->key, newkey, newkeylen);
b0a713e9 1074 key->keylen = newkeylen;
1075 return 0;
1076 }
260fcbeb 1077
a8afca03 1078 md5sig = rcu_dereference_protected(tp->md5sig_info,
1e1d04e6 1079 lockdep_sock_is_held(sk));
1080 if (!md5sig) {
1081 md5sig = kmalloc(sizeof(*md5sig), gfp);
1082 if (!md5sig)
cfb6eeb4 1083 return -ENOMEM;
cfb6eeb4 1084
1085 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1086 INIT_HLIST_HEAD(&md5sig->head);
a8afca03 1087 rcu_assign_pointer(tp->md5sig_info, md5sig);
a915da9b 1088 }
cfb6eeb4 1089
5f3d9cb2 1090 key = sock_kmalloc(sk, sizeof(*key), gfp);
1091 if (!key)
1092 return -ENOMEM;
71cea17e 1093 if (!tcp_alloc_md5sig_pool()) {
5f3d9cb2 1094 sock_kfree_s(sk, key, sizeof(*key));
a915da9b 1095 return -ENOMEM;
cfb6eeb4 1096 }
1097
1098 memcpy(key->key, newkey, newkeylen);
1099 key->keylen = newkeylen;
1100 key->family = family;
6797318e 1101 key->prefixlen = prefixlen;
1102 memcpy(&key->addr, addr,
1103 (family == AF_INET6) ? sizeof(struct in6_addr) :
1104 sizeof(struct in_addr));
1105 hlist_add_head_rcu(&key->node, &md5sig->head);
1106 return 0;
1107}
a915da9b 1108EXPORT_SYMBOL(tcp_md5_do_add);
cfb6eeb4 1109
1110int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1111 u8 prefixlen)
cfb6eeb4 1112{
1113 struct tcp_md5sig_key *key;
1114
6797318e 1115 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1116 if (!key)
1117 return -ENOENT;
1118 hlist_del_rcu(&key->node);
5f3d9cb2 1119 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
a915da9b 1120 kfree_rcu(key, rcu);
a915da9b 1121 return 0;
cfb6eeb4 1122}
a915da9b 1123EXPORT_SYMBOL(tcp_md5_do_del);
cfb6eeb4 1124
e0683e70 1125static void tcp_clear_md5_list(struct sock *sk)
1126{
1127 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1128 struct tcp_md5sig_key *key;
b67bfe0d 1129 struct hlist_node *n;
a8afca03 1130 struct tcp_md5sig_info *md5sig;
cfb6eeb4 1131
1132 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1133
b67bfe0d 1134 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
a915da9b 1135 hlist_del_rcu(&key->node);
5f3d9cb2 1136 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
a915da9b 1137 kfree_rcu(key, rcu);
1138 }
1139}
1140
1141static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1142 char __user *optval, int optlen)
1143{
1144 struct tcp_md5sig cmd;
1145 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
8917a777 1146 u8 prefixlen = 32;
1147
1148 if (optlen < sizeof(cmd))
1149 return -EINVAL;
1150
7174259e 1151 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1152 return -EFAULT;
1153
1154 if (sin->sin_family != AF_INET)
1155 return -EINVAL;
1156
1157 if (optname == TCP_MD5SIG_EXT &&
1158 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1159 prefixlen = cmd.tcpm_prefixlen;
1160 if (prefixlen > 32)
1161 return -EINVAL;
1162 }
1163
64a124ed 1164 if (!cmd.tcpm_keylen)
a915da9b 1165 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
8917a777 1166 AF_INET, prefixlen);
1167
1168 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1169 return -EINVAL;
1170
a915da9b 1171 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
8917a777 1172 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
a915da9b 1173 GFP_KERNEL);
1174}
1175
1176static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1177 __be32 daddr, __be32 saddr,
1178 const struct tcphdr *th, int nbytes)
cfb6eeb4 1179{
cfb6eeb4 1180 struct tcp4_pseudohdr *bp;
49a72dfb 1181 struct scatterlist sg;
19689e38 1182 struct tcphdr *_th;
cfb6eeb4 1183
19689e38 1184 bp = hp->scratch;
1185 bp->saddr = saddr;
1186 bp->daddr = daddr;
1187 bp->pad = 0;
076fb722 1188 bp->protocol = IPPROTO_TCP;
49a72dfb 1189 bp->len = cpu_to_be16(nbytes);
c7da57a1 1190
1191 _th = (struct tcphdr *)(bp + 1);
1192 memcpy(_th, th, sizeof(*th));
1193 _th->check = 0;
1194
1195 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1196 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1197 sizeof(*bp) + sizeof(*th));
cf80e0e4 1198 return crypto_ahash_update(hp->md5_req);
1199}
1200
a915da9b 1201static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 1202 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1203{
1204 struct tcp_md5sig_pool *hp;
cf80e0e4 1205 struct ahash_request *req;
1206
1207 hp = tcp_get_md5sig_pool();
1208 if (!hp)
1209 goto clear_hash_noput;
cf80e0e4 1210 req = hp->md5_req;
49a72dfb 1211
cf80e0e4 1212 if (crypto_ahash_init(req))
49a72dfb 1213 goto clear_hash;
19689e38 1214 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1215 goto clear_hash;
1216 if (tcp_md5_hash_key(hp, key))
1217 goto clear_hash;
1218 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1219 if (crypto_ahash_final(req))
1220 goto clear_hash;
1221
cfb6eeb4 1222 tcp_put_md5sig_pool();
cfb6eeb4 1223 return 0;
49a72dfb 1224
1225clear_hash:
1226 tcp_put_md5sig_pool();
1227clear_hash_noput:
1228 memset(md5_hash, 0, 16);
49a72dfb 1229 return 1;
1230}
1231
1232int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1233 const struct sock *sk,
318cf7aa 1234 const struct sk_buff *skb)
cfb6eeb4 1235{
49a72dfb 1236 struct tcp_md5sig_pool *hp;
cf80e0e4 1237 struct ahash_request *req;
318cf7aa 1238 const struct tcphdr *th = tcp_hdr(skb);
1239 __be32 saddr, daddr;
1240
1241 if (sk) { /* valid for establish/request sockets */
1242 saddr = sk->sk_rcv_saddr;
1243 daddr = sk->sk_daddr;
cfb6eeb4 1244 } else {
1245 const struct iphdr *iph = ip_hdr(skb);
1246 saddr = iph->saddr;
1247 daddr = iph->daddr;
cfb6eeb4 1248 }
1249
1250 hp = tcp_get_md5sig_pool();
1251 if (!hp)
1252 goto clear_hash_noput;
cf80e0e4 1253 req = hp->md5_req;
49a72dfb
AL
1256 goto clear_hash;
1257
19689e38 1258 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1259 goto clear_hash;
1260 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1261 goto clear_hash;
1262 if (tcp_md5_hash_key(hp, key))
1263 goto clear_hash;
1264 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1265 if (crypto_ahash_final(req))
1266 goto clear_hash;
1267
1268 tcp_put_md5sig_pool();
1269 return 0;
1270
1271clear_hash:
1272 tcp_put_md5sig_pool();
1273clear_hash_noput:
1274 memset(md5_hash, 0, 16);
1275 return 1;
cfb6eeb4 1276}
49a72dfb 1277EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
cfb6eeb4 1278
1279#endif
1280
ff74e23f 1281/* Called with rcu_read_lock() */
ba8e275a 1282static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
ff74e23f 1283 const struct sk_buff *skb)
cfb6eeb4 1284{
ba8e275a 1285#ifdef CONFIG_TCP_MD5SIG
1286 /*
1287 * This gets called for each TCP segment that arrives
1288 * so we want to be efficient.
1289 * We have 3 drop cases:
1290 * o No MD5 hash and one expected.
1291 * o MD5 hash and we're not expecting one.
 1292 * o MD5 hash and it's wrong.
1293 */
cf533ea5 1294 const __u8 *hash_location = NULL;
cfb6eeb4 1295 struct tcp_md5sig_key *hash_expected;
eddc9ec5 1296 const struct iphdr *iph = ip_hdr(skb);
cf533ea5 1297 const struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4 1298 int genhash;
1299 unsigned char newhash[16];
1300
1301 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1302 AF_INET);
7d5d5525 1303 hash_location = tcp_parse_md5sig_option(th);
cfb6eeb4 1304
1305 /* We've parsed the options - do we have a hash? */
1306 if (!hash_expected && !hash_location)
a2a385d6 1307 return false;
1308
1309 if (hash_expected && !hash_location) {
c10d9310 1310 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
a2a385d6 1311 return true;
1312 }
1313
1314 if (!hash_expected && hash_location) {
c10d9310 1315 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
a2a385d6 1316 return true;
1317 }
1318
1319 /* Okay, so this is hash_expected and hash_location -
1320 * so we need to calculate the checksum.
1321 */
1322 genhash = tcp_v4_md5_hash_skb(newhash,
1323 hash_expected,
39f8e58e 1324 NULL, skb);
1325
1326 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
72145a68 1327 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1328 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1329 &iph->saddr, ntohs(th->source),
1330 &iph->daddr, ntohs(th->dest),
1331 genhash ? " tcp_v4_calc_md5_hash failed"
1332 : "");
a2a385d6 1333 return true;
cfb6eeb4 1334 }
a2a385d6 1335 return false;
cfb6eeb4 1336#endif
1337 return false;
1338}
cfb6eeb4 1339
1340static void tcp_v4_init_req(struct request_sock *req,
1341 const struct sock *sk_listener,
1342 struct sk_buff *skb)
1343{
1344 struct inet_request_sock *ireq = inet_rsk(req);
c92e8c02 1345 struct net *net = sock_net(sk_listener);
16bea70a 1346
1347 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1348 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
c92e8c02 1349 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1350}
1351
1352static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1353 struct flowi *fl,
4396e461 1354 const struct request_sock *req)
d94e0417 1355{
4396e461 1356 return inet_csk_route_req(sk, &fl->u.ip4, req);
1357}
1358
72a3effa 1359struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1da177e4 1360 .family = PF_INET,
2e6599cb 1361 .obj_size = sizeof(struct tcp_request_sock),
5db92c99 1362 .rtx_syn_ack = tcp_rtx_synack,
1363 .send_ack = tcp_v4_reqsk_send_ack,
1364 .destructor = tcp_v4_reqsk_destructor,
1da177e4 1365 .send_reset = tcp_v4_send_reset,
688d1945 1366 .syn_ack_timeout = tcp_syn_ack_timeout,
1367};
1368
b2e4b3de 1369static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
2aec4a29 1370 .mss_clamp = TCP_MSS_DEFAULT,
16bea70a 1371#ifdef CONFIG_TCP_MD5SIG
fd3a154a 1372 .req_md5_lookup = tcp_v4_md5_lookup,
e3afe7b7 1373 .calc_md5_hash = tcp_v4_md5_hash_skb,
b6332e6c 1374#endif
16bea70a 1375 .init_req = tcp_v4_init_req,
1376#ifdef CONFIG_SYN_COOKIES
1377 .cookie_init_seq = cookie_v4_init_sequence,
1378#endif
d94e0417 1379 .route_req = tcp_v4_route_req,
1380 .init_seq = tcp_v4_init_seq,
1381 .init_ts_off = tcp_v4_init_ts_off,
d6274bd8 1382 .send_synack = tcp_v4_send_synack,
16bea70a 1383};
cfb6eeb4 1384
1385int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1386{
1da177e4 1387 /* Never answer to SYNs send to broadcast or multicast */
511c3f92 1388 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1389 goto drop;
1390
1391 return tcp_conn_request(&tcp_request_sock_ops,
1392 &tcp_request_sock_ipv4_ops, sk, skb);
1da177e4 1393
1da177e4 1394drop:
9caad864 1395 tcp_listendrop(sk);
1396 return 0;
1397}
4bc2f18b 1398EXPORT_SYMBOL(tcp_v4_conn_request);
1399
1400
1401/*
1402 * The three way handshake has completed - we got a valid synack -
1403 * now create the new socket.
1404 */
0c27171e 1405struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
60236fdd 1406 struct request_sock *req,
1407 struct dst_entry *dst,
1408 struct request_sock *req_unhash,
1409 bool *own_req)
1da177e4 1410{
2e6599cb 1411 struct inet_request_sock *ireq;
1412 struct inet_sock *newinet;
1413 struct tcp_sock *newtp;
1414 struct sock *newsk;
1415#ifdef CONFIG_TCP_MD5SIG
1416 struct tcp_md5sig_key *key;
1417#endif
f6d8bd05 1418 struct ip_options_rcu *inet_opt;
1419
1420 if (sk_acceptq_is_full(sk))
1421 goto exit_overflow;
1422
1423 newsk = tcp_create_openreq_child(sk, req, skb);
1424 if (!newsk)
093d2823 1425 goto exit_nonewsk;
1da177e4 1426
bcd76111 1427 newsk->sk_gso_type = SKB_GSO_TCPV4;
fae6ef87 1428 inet_sk_rx_dst_set(newsk, skb);
1429
1430 newtp = tcp_sk(newsk);
1431 newinet = inet_sk(newsk);
2e6599cb 1432 ireq = inet_rsk(req);
1433 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1434 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
6dd9a14e 1435 newsk->sk_bound_dev_if = ireq->ir_iif;
1436 newinet->inet_saddr = ireq->ir_loc_addr;
1437 inet_opt = rcu_dereference(ireq->ireq_opt);
1438 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
463c84b9 1439 newinet->mc_index = inet_iif(skb);
eddc9ec5 1440 newinet->mc_ttl = ip_hdr(skb)->ttl;
4c507d28 1441 newinet->rcv_tos = ip_hdr(skb)->tos;
d83d8461 1442 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1443 if (inet_opt)
1444 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
c720c7e8 1445 newinet->inet_id = newtp->write_seq ^ jiffies;
1da177e4 1446
1447 if (!dst) {
1448 dst = inet_csk_route_child_sock(sk, newsk, req);
1449 if (!dst)
1450 goto put_and_exit;
1451 } else {
1452 /* syncookie case : see end of cookie_v4_check() */
1453 }
1454 sk_setup_caps(newsk, dst);
1455
1456 tcp_ca_openreq_child(newsk, dst);
1457
1da177e4 1458 tcp_sync_mss(newsk, dst_mtu(dst));
3541f9e8 1459 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
f5fff5dc 1460
1461 tcp_initialize_rcv_mss(newsk);
1462
1463#ifdef CONFIG_TCP_MD5SIG
1464 /* Copy over the MD5 key from the original socket */
1465 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1466 AF_INET);
00db4124 1467 if (key) {
1468 /*
1469 * We're using one, so create a matching key
1470 * on the newsk structure. If we fail to get
1471 * memory, then we end up not copying the key
1472 * across. Shucks.
1473 */
a915da9b 1474 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
6797318e 1475 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
a465419b 1476 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1477 }
1478#endif
1479
1480 if (__inet_inherit_port(sk, newsk) < 0)
1481 goto put_and_exit;
5e0724d0 1482 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
c92e8c02 1483 if (likely(*own_req)) {
49a496c9 1484 tcp_move_syn(newtp, req);
1485 ireq->ireq_opt = NULL;
1486 } else {
1487 newinet->inet_opt = NULL;
1488 }
1489 return newsk;
1490
1491exit_overflow:
c10d9310 1492 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1493exit_nonewsk:
1494 dst_release(dst);
1da177e4 1495exit:
9caad864 1496 tcp_listendrop(sk);
1da177e4 1497 return NULL;
0e734419 1498put_and_exit:
c92e8c02 1499 newinet->inet_opt = NULL;
1500 inet_csk_prepare_forced_close(newsk);
1501 tcp_done(newsk);
0e734419 1502 goto exit;
1da177e4 1503}
4bc2f18b 1504EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1da177e4 1505
079096f1 1506static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1da177e4 1507{
079096f1 1508#ifdef CONFIG_SYN_COOKIES
52452c54 1509 const struct tcphdr *th = tcp_hdr(skb);
1da177e4 1510
af9b4738 1511 if (!th->syn)
461b74c3 1512 sk = cookie_v4_check(sk, skb);
1513#endif
1514 return sk;
1515}
1516
1da177e4 1517/* The socket must have it's spinlock held when we get
e994b2f0 1518 * here, unless it is a TCP_LISTEN socket.
1519 *
1520 * We have a potential double-lock case here, so even when
1521 * doing backlog processing we use the BH locking scheme.
1522 * This is because we cannot sleep with the original spinlock
1523 * held.
1524 */
1525int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1526{
cfb6eeb4 1527 struct sock *rsk;
cfb6eeb4 1528
1da177e4 1529 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1530 struct dst_entry *dst = sk->sk_rx_dst;
1531
bdeab991 1532 sock_rps_save_rxhash(sk, skb);
3d97379a 1533 sk_mark_napi_id(sk, skb);
404e0a8b 1534 if (dst) {
505fbcf0 1535 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
51456b29 1536 !dst->ops->check(dst, 0)) {
1537 dst_release(dst);
1538 sk->sk_rx_dst = NULL;
1539 }
1540 }
3d97d88e 1541 tcp_rcv_established(sk, skb);
1542 return 0;
1543 }
1544
12e25e10 1545 if (tcp_checksum_complete(skb))
1546 goto csum_err;
1547
1548 if (sk->sk_state == TCP_LISTEN) {
1549 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1550
1551 if (!nsk)
1552 goto discard;
1da177e4 1553 if (nsk != sk) {
1554 if (tcp_child_process(sk, nsk, skb)) {
1555 rsk = nsk;
1da177e4 1556 goto reset;
cfb6eeb4 1557 }
1558 return 0;
1559 }
ca55158c 1560 } else
bdeab991 1561 sock_rps_save_rxhash(sk, skb);
ca55158c 1562
72ab4a86 1563 if (tcp_rcv_state_process(sk, skb)) {
cfb6eeb4 1564 rsk = sk;
1da177e4 1565 goto reset;
cfb6eeb4 1566 }
1567 return 0;
1568
1569reset:
cfb6eeb4 1570 tcp_v4_send_reset(rsk, skb);
1571discard:
1572 kfree_skb(skb);
1573 /* Be careful here. If this function gets more complicated and
1574 * gcc suffers from register pressure on the x86, sk (in %ebx)
1575 * might be destroyed here. This current version compiles correctly,
1576 * but you have been warned.
1577 */
1578 return 0;
1579
1580csum_err:
1581 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1582 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1583 goto discard;
1584}
4bc2f18b 1585EXPORT_SYMBOL(tcp_v4_do_rcv);
1da177e4 1586
7487449c 1587int tcp_v4_early_demux(struct sk_buff *skb)
41063e9d 1588{
1589 const struct iphdr *iph;
1590 const struct tcphdr *th;
1591 struct sock *sk;
41063e9d 1592
41063e9d 1593 if (skb->pkt_type != PACKET_HOST)
7487449c 1594 return 0;
41063e9d 1595
45f00f99 1596 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
7487449c 1597 return 0;
1598
1599 iph = ip_hdr(skb);
45f00f99 1600 th = tcp_hdr(skb);
1601
1602 if (th->doff < sizeof(struct tcphdr) / 4)
7487449c 1603 return 0;
41063e9d 1604
45f00f99 1605 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
41063e9d 1606 iph->saddr, th->source,
7011d085 1607 iph->daddr, ntohs(th->dest),
3fa6f616 1608 skb->skb_iif, inet_sdif(skb));
1609 if (sk) {
1610 skb->sk = sk;
1611 skb->destructor = sock_edemux;
f7e4eb03 1612 if (sk_fullsock(sk)) {
d0c294c5 1613 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
505fbcf0 1614
1615 if (dst)
1616 dst = dst_check(dst, 0);
92101b3b 1617 if (dst &&
505fbcf0 1618 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
92101b3b 1619 skb_dst_set_noref(skb, dst);
1620 }
1621 }
7487449c 1622 return 0;
1623}
1624
1625bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1626{
1627 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1628 struct skb_shared_info *shinfo;
1629 const struct tcphdr *th;
1630 struct tcphdr *thtail;
1631 struct sk_buff *tail;
1632 unsigned int hdrlen;
1633 bool fragstolen;
1634 u32 gso_segs;
1635 int delta;
1636
1637 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1638 * we can fix skb->truesize to its real value to avoid future drops.
1639 * This is valid because skb is not yet charged to the socket.
1640 * It has been noticed pure SACK packets were sometimes dropped
1641 * (if cooked by drivers without copybreak feature).
1642 */
60b1af33 1643 skb_condense(skb);
c9c33212 1644
1645 skb_dst_drop(skb);
1646
1647 if (unlikely(tcp_checksum_complete(skb))) {
1648 bh_unlock_sock(sk);
1649 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1650 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1651 return true;
1652 }
1653
1654 /* Attempt coalescing to last skb in backlog, even if we are
1655 * above the limits.
1656 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1657 */
1658 th = (const struct tcphdr *)skb->data;
1659 hdrlen = th->doff * 4;
1660 shinfo = skb_shinfo(skb);
1661
1662 if (!shinfo->gso_size)
1663 shinfo->gso_size = skb->len - hdrlen;
1664
1665 if (!shinfo->gso_segs)
1666 shinfo->gso_segs = 1;
1667
1668 tail = sk->sk_backlog.tail;
1669 if (!tail)
1670 goto no_coalesce;
1671 thtail = (struct tcphdr *)tail->data;
1672
1673 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1674 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1675 ((TCP_SKB_CB(tail)->tcp_flags |
1676 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
1677 ((TCP_SKB_CB(tail)->tcp_flags ^
1678 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1679#ifdef CONFIG_TLS_DEVICE
1680 tail->decrypted != skb->decrypted ||
1681#endif
1682 thtail->doff != th->doff ||
1683 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1684 goto no_coalesce;
1685
1686 __skb_pull(skb, hdrlen);
1687 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1688 thtail->window = th->window;
1689
1690 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1691
1692 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1693 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1694
1695 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1696
1697 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1698 TCP_SKB_CB(tail)->has_rxtstamp = true;
1699 tail->tstamp = skb->tstamp;
1700 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1701 }
1702
1703 /* Not as strict as GRO. We only need to carry mss max value */
1704 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1705 skb_shinfo(tail)->gso_size);
1706
1707 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1708 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1709
1710 sk->sk_backlog.len += delta;
1711 __NET_INC_STATS(sock_net(sk),
1712 LINUX_MIB_TCPBACKLOGCOALESCE);
1713 kfree_skb_partial(skb, fragstolen);
1714 return false;
1715 }
1716 __skb_push(skb, hdrlen);
1717
1718no_coalesce:
1719 /* Only socket owner can try to collapse/prune rx queues
1720 * to reduce memory overhead, so add a little headroom here.
1721 * Only a few socket backlogs are likely to be non-empty at any given time.
1722 */
1723 limit += 64*1024;
1724
c9c33212
ED
1725 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1726 bh_unlock_sock(sk);
1727 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1728 return true;
1729 }
1730 return false;
1731}
1732EXPORT_SYMBOL(tcp_add_backlog);
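
A minimal userspace sketch of the two pieces of arithmetic above: the backlog limit with its 64 KB headroom, and the 16-bit saturation applied to the coalesced gso_segs count. The buffer sizes below are invented for the example and nothing here is kernel API.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* stand-ins for sk->sk_rcvbuf and sk->sk_sndbuf; values are made up */
	uint32_t rcvbuf = 212992, sndbuf = 212992;
	uint32_t limit = rcvbuf + sndbuf;

	/* only the socket owner collapses/prunes, so allow some headroom */
	limit += 64 * 1024;
	printf("backlog limit: %u bytes\n", limit);

	/* gso_segs of the coalesced tail skb saturates at 16 bits */
	uint32_t tail_segs = 60000, new_segs = 10000;
	uint32_t gso_segs = tail_segs + new_segs;

	if (gso_segs > 0xFFFF)
		gso_segs = 0xFFFF;
	printf("coalesced gso_segs: %u\n", gso_segs);
	return 0;
}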
1733
ac6e7800
ED
1734int tcp_filter(struct sock *sk, struct sk_buff *skb)
1735{
1736 struct tcphdr *th = (struct tcphdr *)skb->data;
ac6e7800 1737
f2feaefd 1738 return sk_filter_trim_cap(sk, skb, th->doff * 4);
ac6e7800
ED
1739}
1740EXPORT_SYMBOL(tcp_filter);
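
tcp_filter() runs whatever socket filter is attached, with sk_filter_trim_cap() ensuring that any trimming never cuts into the TCP header (th->doff * 4 bytes). Below is a small, self-contained userspace example of attaching such a classic BPF filter with SO_ATTACH_FILTER; the accept-all program is purely illustrative.

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/filter.h>

int main(void)
{
	/* cBPF program that accepts every packet: returning a large length
	 * means "keep up to this many bytes"; returning 0 would ask the
	 * kernel to drop the segment entirely, while non-zero returns only
	 * trim, and tcp_filter()'s cap keeps at least the TCP header.
	 */
	struct sock_filter code[] = {
		BPF_STMT(BPF_RET | BPF_K, 0xFFFFFFFFu),
	};
	struct sock_fprog prog = {
		.len	= sizeof(code) / sizeof(code[0]),
		.filter	= code,
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
				 &prog, sizeof(prog)) < 0)
		perror("SO_ATTACH_FILTER");
	else
		puts("filter attached");
	return 0;
}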
1741
eeea10b8
ED
1742static void tcp_v4_restore_cb(struct sk_buff *skb)
1743{
1744 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1745 sizeof(struct inet_skb_parm));
1746}
1747
1748static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1749 const struct tcphdr *th)
1750{
1751 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1752 * barrier() makes sure the compiler won't play fool^Waliasing games.
1753 */
1754 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1755 sizeof(struct inet_skb_parm));
1756 barrier();
1757
1758 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1759 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1760 skb->len - th->doff * 4);
1761 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1762 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1763 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1764 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1765 TCP_SKB_CB(skb)->sacked = 0;
1766 TCP_SKB_CB(skb)->has_rxtstamp =
1767 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1768}
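
The end_seq computed above reflects the rule that SYN and FIN each consume one unit of sequence space in addition to the payload bytes. A tiny standalone illustration with made-up values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t seq = 1000;		/* ntohl(th->seq) */
	unsigned int syn = 1, fin = 0;	/* header flag bits */
	unsigned int payload = 0;	/* skb->len - th->doff * 4 */
	uint32_t end_seq = seq + syn + fin + payload;

	/* a bare SYN advances the sequence space by exactly one */
	printf("seq=%u end_seq=%u\n", seq, end_seq);
	return 0;
}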
1769
1da177e4
LT
1770/*
1771 * From tcp_input.c
1772 */
1773
1774int tcp_v4_rcv(struct sk_buff *skb)
1775{
3b24d854 1776 struct net *net = dev_net(skb->dev);
3fa6f616 1777 int sdif = inet_sdif(skb);
eddc9ec5 1778 const struct iphdr *iph;
cf533ea5 1779 const struct tcphdr *th;
3b24d854 1780 bool refcounted;
1da177e4
LT
1781 struct sock *sk;
1782 int ret;
1783
1784 if (skb->pkt_type != PACKET_HOST)
1785 goto discard_it;
1786
1787 /* Count it even if it's bad */
90bbcc60 1788 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1da177e4
LT
1789
1790 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1791 goto discard_it;
1792
ea1627c2 1793 th = (const struct tcphdr *)skb->data;
1da177e4 1794
ea1627c2 1795 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1da177e4
LT
1796 goto bad_packet;
1797 if (!pskb_may_pull(skb, th->doff * 4))
1798 goto discard_it;
1799
1800 /* An explanation is required here, I think.
1801 * Packet length and doff are validated by header prediction,
caa20d9a 1802 * provided the case of th->doff == 0 is eliminated.
1da177e4 1803 * So, we defer the checks. */
ed70fcfc
TH
1804
1805 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1806 goto csum_error;
1da177e4 1807
ea1627c2 1808 th = (const struct tcphdr *)skb->data;
eddc9ec5 1809 iph = ip_hdr(skb);
4bdc3d66 1810lookup:
a583636a 1811 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
3fa6f616 1812 th->dest, sdif, &refcounted);
1da177e4
LT
1813 if (!sk)
1814 goto no_tcp_socket;
1815
bb134d5d
ED
1816process:
1817 if (sk->sk_state == TCP_TIME_WAIT)
1818 goto do_time_wait;
1819
079096f1
ED
1820 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1821 struct request_sock *req = inet_reqsk(sk);
e0f9759f 1822 bool req_stolen = false;
7716682c 1823 struct sock *nsk;
079096f1
ED
1824
1825 sk = req->rsk_listener;
72923555 1826 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
e65c332d 1827 sk_drops_add(sk, skb);
72923555
ED
1828 reqsk_put(req);
1829 goto discard_it;
1830 }
4fd44a98
FL
1831 if (tcp_checksum_complete(skb)) {
1832 reqsk_put(req);
1833 goto csum_error;
1834 }
7716682c 1835 if (unlikely(sk->sk_state != TCP_LISTEN)) {
f03f2e15 1836 inet_csk_reqsk_queue_drop_and_put(sk, req);
4bdc3d66
ED
1837 goto lookup;
1838 }
3b24d854
ED
1839 /* We own a reference on the listener, increase it again
1840 * as we might lose it too soon.
1841 */
7716682c 1842 sock_hold(sk);
3b24d854 1843 refcounted = true;
1f3b359f 1844 nsk = NULL;
eeea10b8
ED
1845 if (!tcp_filter(sk, skb)) {
1846 th = (const struct tcphdr *)skb->data;
1847 iph = ip_hdr(skb);
1848 tcp_v4_fill_cb(skb, iph, th);
e0f9759f 1849 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
eeea10b8 1850 }
079096f1
ED
1851 if (!nsk) {
1852 reqsk_put(req);
e0f9759f
ED
1853 if (req_stolen) {
1854 /* Another cpu got exclusive access to req
1855 * and created a full blown socket.
1856 * Try to feed this packet to this socket
1857 * instead of discarding it.
1858 */
1859 tcp_v4_restore_cb(skb);
1860 sock_put(sk);
1861 goto lookup;
1862 }
7716682c 1863 goto discard_and_relse;
079096f1
ED
1864 }
1865 if (nsk == sk) {
079096f1 1866 reqsk_put(req);
eeea10b8 1867 tcp_v4_restore_cb(skb);
079096f1
ED
1868 } else if (tcp_child_process(sk, nsk, skb)) {
1869 tcp_v4_send_reset(nsk, skb);
7716682c 1870 goto discard_and_relse;
079096f1 1871 } else {
7716682c 1872 sock_put(sk);
079096f1
ED
1873 return 0;
1874 }
1875 }
6cce09f8 1876 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
02a1d6e7 1877 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1878 goto discard_and_relse;
6cce09f8 1879 }
d218d111 1880
1da177e4
LT
1881 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1882 goto discard_and_relse;
9ea88a15 1883
9ea88a15
DP
1884 if (tcp_v4_inbound_md5_hash(sk, skb))
1885 goto discard_and_relse;
9ea88a15 1886
b59c2701 1887 nf_reset(skb);
1da177e4 1888
ac6e7800 1889 if (tcp_filter(sk, skb))
1da177e4 1890 goto discard_and_relse;
ac6e7800
ED
1891 th = (const struct tcphdr *)skb->data;
1892 iph = ip_hdr(skb);
eeea10b8 1893 tcp_v4_fill_cb(skb, iph, th);
1da177e4
LT
1894
1895 skb->dev = NULL;
1896
e994b2f0
ED
1897 if (sk->sk_state == TCP_LISTEN) {
1898 ret = tcp_v4_do_rcv(sk, skb);
1899 goto put_and_return;
1900 }
1901
1902 sk_incoming_cpu_update(sk);
1903
c6366184 1904 bh_lock_sock_nested(sk);
a44d6eac 1905 tcp_segs_in(tcp_sk(sk), skb);
1da177e4
LT
1906 ret = 0;
1907 if (!sock_owned_by_user(sk)) {
e7942d06 1908 ret = tcp_v4_do_rcv(sk, skb);
c9c33212 1909 } else if (tcp_add_backlog(sk, skb)) {
6b03a53a
ZY
1910 goto discard_and_relse;
1911 }
1da177e4
LT
1912 bh_unlock_sock(sk);
1913
e994b2f0 1914put_and_return:
3b24d854
ED
1915 if (refcounted)
1916 sock_put(sk);
1da177e4
LT
1917
1918 return ret;
1919
1920no_tcp_socket:
1921 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1922 goto discard_it;
1923
eeea10b8
ED
1924 tcp_v4_fill_cb(skb, iph, th);
1925
12e25e10 1926 if (tcp_checksum_complete(skb)) {
6a5dc9e5 1927csum_error:
90bbcc60 1928 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1da177e4 1929bad_packet:
90bbcc60 1930 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1da177e4 1931 } else {
cfb6eeb4 1932 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
1933 }
1934
1935discard_it:
1936 /* Discard frame. */
1937 kfree_skb(skb);
e905a9ed 1938 return 0;
1da177e4
LT
1939
1940discard_and_relse:
532182cd 1941 sk_drops_add(sk, skb);
3b24d854
ED
1942 if (refcounted)
1943 sock_put(sk);
1da177e4
LT
1944 goto discard_it;
1945
1946do_time_wait:
1947 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 1948 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1949 goto discard_it;
1950 }
1951
eeea10b8
ED
1952 tcp_v4_fill_cb(skb, iph, th);
1953
6a5dc9e5
ED
1954 if (tcp_checksum_complete(skb)) {
1955 inet_twsk_put(inet_twsk(sk));
1956 goto csum_error;
1da177e4 1957 }
9469c7b4 1958 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 1959 case TCP_TW_SYN: {
c346dca1 1960 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
a583636a
CG
1961 &tcp_hashinfo, skb,
1962 __tcp_hdrlen(th),
da5e3630 1963 iph->saddr, th->source,
eddc9ec5 1964 iph->daddr, th->dest,
3fa6f616
DA
1965 inet_iif(skb),
1966 sdif);
1da177e4 1967 if (sk2) {
dbe7faa4 1968 inet_twsk_deschedule_put(inet_twsk(sk));
1da177e4 1969 sk = sk2;
eeea10b8 1970 tcp_v4_restore_cb(skb);
3b24d854 1971 refcounted = false;
1da177e4
LT
1972 goto process;
1973 }
1da177e4 1974 }
fcfd6dfa
GS
1975 /* to ACK */
1976 /* fall through */
1da177e4
LT
1977 case TCP_TW_ACK:
1978 tcp_v4_timewait_ack(sk, skb);
1979 break;
1980 case TCP_TW_RST:
271c3b9b
FW
1981 tcp_v4_send_reset(sk, skb);
1982 inet_twsk_deschedule_put(inet_twsk(sk));
1983 goto discard_it;
1da177e4
LT
1984 case TCP_TW_SUCCESS:;
1985 }
1986 goto discard_it;
1987}
1988
ccb7c410
DM
1989static struct timewait_sock_ops tcp_timewait_sock_ops = {
1990 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1991 .twsk_unique = tcp_twsk_unique,
1992 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 1993};
1da177e4 1994
63d02d15 1995void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
1996{
1997 struct dst_entry *dst = skb_dst(skb);
1998
5037e9ef 1999 if (dst && dst_hold_safe(dst)) {
ca777eff
ED
2000 sk->sk_rx_dst = dst;
2001 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2002 }
5d299f3d 2003}
63d02d15 2004EXPORT_SYMBOL(inet_sk_rx_dst_set);
5d299f3d 2005
3b401a81 2006const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
2007 .queue_xmit = ip_queue_xmit,
2008 .send_check = tcp_v4_send_check,
2009 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 2010 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
2011 .conn_request = tcp_v4_conn_request,
2012 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
2013 .net_header_len = sizeof(struct iphdr),
2014 .setsockopt = ip_setsockopt,
2015 .getsockopt = ip_getsockopt,
2016 .addr2sockaddr = inet_csk_addr2sockaddr,
2017 .sockaddr_len = sizeof(struct sockaddr_in),
3fdadf7d 2018#ifdef CONFIG_COMPAT
543d9cfe
ACM
2019 .compat_setsockopt = compat_ip_setsockopt,
2020 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 2021#endif
4fab9071 2022 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 2023};
4bc2f18b 2024EXPORT_SYMBOL(ipv4_specific);
1da177e4 2025
cfb6eeb4 2026#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 2027static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 2028 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 2029 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 2030 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 2031};
b6332e6c 2032#endif
cfb6eeb4 2033
1da177e4
LT
2034/* NOTE: A lot of things are set to zero explicitly by the call to
2035 * sk_alloc(), so they need not be done here.
2036 */
2037static int tcp_v4_init_sock(struct sock *sk)
2038{
6687e988 2039 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 2040
900f65d3 2041 tcp_init_sock(sk);
1da177e4 2042
8292a17a 2043 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 2044
cfb6eeb4 2045#ifdef CONFIG_TCP_MD5SIG
ac807fa8 2046 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 2047#endif
1da177e4 2048
1da177e4
LT
2049 return 0;
2050}
2051
7d06b2e0 2052void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
2053{
2054 struct tcp_sock *tp = tcp_sk(sk);
2055
e1a4aa50
SL
2056 trace_tcp_destroy_sock(sk);
2057
1da177e4
LT
2058 tcp_clear_xmit_timers(sk);
2059
6687e988 2060 tcp_cleanup_congestion_control(sk);
317a76f9 2061
734942cc
DW
2062 tcp_cleanup_ulp(sk);
2063
1da177e4 2064 /* Clean up the write buffer. */
fe067e8a 2065 tcp_write_queue_purge(sk);
1da177e4 2066
cf1ef3f0
WW
2067 /* Check if we want to disable active TFO */
2068 tcp_fastopen_active_disable_ofo_check(sk);
2069
1da177e4 2070 /* Cleans up our, hopefully empty, out_of_order_queue. */
9f5afeae 2071 skb_rbtree_purge(&tp->out_of_order_queue);
1da177e4 2072
cfb6eeb4
YH
2073#ifdef CONFIG_TCP_MD5SIG
2074 /* Clean up the MD5 key list, if any */
2075 if (tp->md5sig_info) {
a915da9b 2076 tcp_clear_md5_list(sk);
fb7df5e4 2077 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
cfb6eeb4
YH
2078 tp->md5sig_info = NULL;
2079 }
2080#endif
1a2449a8 2081
1da177e4 2082 /* Clean up a referenced TCP bind bucket. */
463c84b9 2083 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 2084 inet_put_port(sk);
1da177e4 2085
00db4124 2086 BUG_ON(tp->fastopen_rsk);
435cf559 2087
cf60af03
YC
2088 /* If socket is aborted during connect operation */
2089 tcp_free_fastopen_req(tp);
1fba70e5 2090 tcp_fastopen_destroy_cipher(sk);
cd8ae852 2091 tcp_saved_syn_free(tp);
cf60af03 2092
180d8cd9 2093 sk_sockets_allocated_dec(sk);
1da177e4 2094}
1da177e4
LT
2095EXPORT_SYMBOL(tcp_v4_destroy_sock);
2096
2097#ifdef CONFIG_PROC_FS
2098/* Proc filesystem TCP sock list dumping. */
2099
a8b690f9
TH
2100/*
2101 * Get the next listener socket following cur. If cur is NULL, get the
2102 * first socket starting from the bucket given in st->bucket; when
2103 * st->bucket is zero, the very first socket in the hash table is returned.
2104 */
1da177e4
LT
2105static void *listening_get_next(struct seq_file *seq, void *cur)
2106{
37d849bb 2107 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
5799de0b 2108 struct tcp_iter_state *st = seq->private;
a4146b1b 2109 struct net *net = seq_file_net(seq);
3b24d854 2110 struct inet_listen_hashbucket *ilb;
3b24d854 2111 struct sock *sk = cur;
1da177e4
LT
2112
2113 if (!sk) {
3b24d854 2114get_head:
a8b690f9 2115 ilb = &tcp_hashinfo.listening_hash[st->bucket];
9652dc2e 2116 spin_lock(&ilb->lock);
3b24d854 2117 sk = sk_head(&ilb->head);
a8b690f9 2118 st->offset = 0;
1da177e4
LT
2119 goto get_sk;
2120 }
5caea4ea 2121 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 2122 ++st->num;
a8b690f9 2123 ++st->offset;
1da177e4 2124
3b24d854 2125 sk = sk_next(sk);
1da177e4 2126get_sk:
3b24d854 2127 sk_for_each_from(sk) {
8475ef9f
PE
2128 if (!net_eq(sock_net(sk), net))
2129 continue;
37d849bb 2130 if (sk->sk_family == afinfo->family)
3b24d854 2131 return sk;
1da177e4 2132 }
9652dc2e 2133 spin_unlock(&ilb->lock);
a8b690f9 2134 st->offset = 0;
3b24d854
ED
2135 if (++st->bucket < INET_LHTABLE_SIZE)
2136 goto get_head;
2137 return NULL;
1da177e4
LT
2138}
2139
2140static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2141{
a8b690f9
TH
2142 struct tcp_iter_state *st = seq->private;
2143 void *rc;
2144
2145 st->bucket = 0;
2146 st->offset = 0;
2147 rc = listening_get_next(seq, NULL);
1da177e4
LT
2148
2149 while (rc && *pos) {
2150 rc = listening_get_next(seq, rc);
2151 --*pos;
2152 }
2153 return rc;
2154}
2155
05dbc7b5 2156static inline bool empty_bucket(const struct tcp_iter_state *st)
6eac5604 2157{
05dbc7b5 2158 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
6eac5604
AK
2159}
2160
a8b690f9
TH
2161/*
2162 * Get first established socket starting from bucket given in st->bucket.
2163 * If st->bucket is zero, the very first socket in the hash is returned.
2164 */
1da177e4
LT
2165static void *established_get_first(struct seq_file *seq)
2166{
37d849bb 2167 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
5799de0b 2168 struct tcp_iter_state *st = seq->private;
a4146b1b 2169 struct net *net = seq_file_net(seq);
1da177e4
LT
2170 void *rc = NULL;
2171
a8b690f9
TH
2172 st->offset = 0;
2173 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 2174 struct sock *sk;
3ab5aee7 2175 struct hlist_nulls_node *node;
9db66bdc 2176 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 2177
6eac5604
AK
2178 /* Lockless fast path for the common case of empty buckets */
2179 if (empty_bucket(st))
2180 continue;
2181
9db66bdc 2182 spin_lock_bh(lock);
3ab5aee7 2183 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
37d849bb 2184 if (sk->sk_family != afinfo->family ||
878628fb 2185 !net_eq(sock_net(sk), net)) {
1da177e4
LT
2186 continue;
2187 }
2188 rc = sk;
2189 goto out;
2190 }
9db66bdc 2191 spin_unlock_bh(lock);
1da177e4
LT
2192 }
2193out:
2194 return rc;
2195}
2196
2197static void *established_get_next(struct seq_file *seq, void *cur)
2198{
37d849bb 2199 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
1da177e4 2200 struct sock *sk = cur;
3ab5aee7 2201 struct hlist_nulls_node *node;
5799de0b 2202 struct tcp_iter_state *st = seq->private;
a4146b1b 2203 struct net *net = seq_file_net(seq);
1da177e4
LT
2204
2205 ++st->num;
a8b690f9 2206 ++st->offset;
1da177e4 2207
05dbc7b5 2208 sk = sk_nulls_next(sk);
1da177e4 2209
3ab5aee7 2210 sk_nulls_for_each_from(sk, node) {
37d849bb
CH
2211 if (sk->sk_family == afinfo->family &&
2212 net_eq(sock_net(sk), net))
05dbc7b5 2213 return sk;
1da177e4
LT
2214 }
2215
05dbc7b5
ED
2216 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2217 ++st->bucket;
2218 return established_get_first(seq);
1da177e4
LT
2219}
2220
2221static void *established_get_idx(struct seq_file *seq, loff_t pos)
2222{
a8b690f9
TH
2223 struct tcp_iter_state *st = seq->private;
2224 void *rc;
2225
2226 st->bucket = 0;
2227 rc = established_get_first(seq);
1da177e4
LT
2228
2229 while (rc && pos) {
2230 rc = established_get_next(seq, rc);
2231 --pos;
7174259e 2232 }
1da177e4
LT
2233 return rc;
2234}
2235
2236static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2237{
2238 void *rc;
5799de0b 2239 struct tcp_iter_state *st = seq->private;
1da177e4 2240
1da177e4
LT
2241 st->state = TCP_SEQ_STATE_LISTENING;
2242 rc = listening_get_idx(seq, &pos);
2243
2244 if (!rc) {
1da177e4
LT
2245 st->state = TCP_SEQ_STATE_ESTABLISHED;
2246 rc = established_get_idx(seq, pos);
2247 }
2248
2249 return rc;
2250}
2251
a8b690f9
TH
2252static void *tcp_seek_last_pos(struct seq_file *seq)
2253{
2254 struct tcp_iter_state *st = seq->private;
2255 int offset = st->offset;
2256 int orig_num = st->num;
2257 void *rc = NULL;
2258
2259 switch (st->state) {
a8b690f9
TH
2260 case TCP_SEQ_STATE_LISTENING:
2261 if (st->bucket >= INET_LHTABLE_SIZE)
2262 break;
2263 st->state = TCP_SEQ_STATE_LISTENING;
2264 rc = listening_get_next(seq, NULL);
2265 while (offset-- && rc)
2266 rc = listening_get_next(seq, rc);
2267 if (rc)
2268 break;
2269 st->bucket = 0;
05dbc7b5 2270 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2271 /* Fallthrough */
2272 case TCP_SEQ_STATE_ESTABLISHED:
a8b690f9
TH
2273 if (st->bucket > tcp_hashinfo.ehash_mask)
2274 break;
2275 rc = established_get_first(seq);
2276 while (offset-- && rc)
2277 rc = established_get_next(seq, rc);
2278 }
2279
2280 st->num = orig_num;
2281
2282 return rc;
2283}
2284
37d849bb 2285void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1da177e4 2286{
5799de0b 2287 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2288 void *rc;
2289
2290 if (*pos && *pos == st->last_pos) {
2291 rc = tcp_seek_last_pos(seq);
2292 if (rc)
2293 goto out;
2294 }
2295
1da177e4
LT
2296 st->state = TCP_SEQ_STATE_LISTENING;
2297 st->num = 0;
a8b690f9
TH
2298 st->bucket = 0;
2299 st->offset = 0;
2300 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2301
2302out:
2303 st->last_pos = *pos;
2304 return rc;
1da177e4 2305}
37d849bb 2306EXPORT_SYMBOL(tcp_seq_start);
1da177e4 2307
37d849bb 2308void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1da177e4 2309{
a8b690f9 2310 struct tcp_iter_state *st = seq->private;
1da177e4 2311 void *rc = NULL;
1da177e4
LT
2312
2313 if (v == SEQ_START_TOKEN) {
2314 rc = tcp_get_idx(seq, 0);
2315 goto out;
2316 }
1da177e4
LT
2317
2318 switch (st->state) {
1da177e4
LT
2319 case TCP_SEQ_STATE_LISTENING:
2320 rc = listening_get_next(seq, v);
2321 if (!rc) {
1da177e4 2322 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2323 st->bucket = 0;
2324 st->offset = 0;
1da177e4
LT
2325 rc = established_get_first(seq);
2326 }
2327 break;
2328 case TCP_SEQ_STATE_ESTABLISHED:
1da177e4
LT
2329 rc = established_get_next(seq, v);
2330 break;
2331 }
2332out:
2333 ++*pos;
a8b690f9 2334 st->last_pos = *pos;
1da177e4
LT
2335 return rc;
2336}
37d849bb 2337EXPORT_SYMBOL(tcp_seq_next);
1da177e4 2338
37d849bb 2339void tcp_seq_stop(struct seq_file *seq, void *v)
1da177e4 2340{
5799de0b 2341 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2342
2343 switch (st->state) {
1da177e4
LT
2344 case TCP_SEQ_STATE_LISTENING:
2345 if (v != SEQ_START_TOKEN)
9652dc2e 2346 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4 2347 break;
1da177e4
LT
2348 case TCP_SEQ_STATE_ESTABLISHED:
2349 if (v)
9db66bdc 2350 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2351 break;
2352 }
2353}
37d849bb 2354EXPORT_SYMBOL(tcp_seq_stop);
1da177e4 2355
d4f06873 2356static void get_openreq4(const struct request_sock *req,
aa3a0c8c 2357 struct seq_file *f, int i)
1da177e4 2358{
2e6599cb 2359 const struct inet_request_sock *ireq = inet_rsk(req);
fa76ce73 2360 long delta = req->rsk_timer.expires - jiffies;
1da177e4 2361
5e659e4c 2362 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2363 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2364 i,
634fb979 2365 ireq->ir_loc_addr,
d4f06873 2366 ireq->ir_num,
634fb979
ED
2367 ireq->ir_rmt_addr,
2368 ntohs(ireq->ir_rmt_port),
1da177e4
LT
2369 TCP_SYN_RECV,
2370 0, 0, /* could print option size, but that is af dependent. */
2371 1, /* timers active (only the expire timer) */
a399a805 2372 jiffies_delta_to_clock_t(delta),
e6c022a4 2373 req->num_timeout,
aa3a0c8c
ED
2374 from_kuid_munged(seq_user_ns(f),
2375 sock_i_uid(req->rsk_listener)),
1da177e4
LT
2376 0, /* non standard timer */
2377 0, /* open_requests have no inode */
d4f06873 2378 0,
652586df 2379 req);
1da177e4
LT
2380}
2381
652586df 2382static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
1da177e4
LT
2383{
2384 int timer_active;
2385 unsigned long timer_expires;
cf533ea5 2386 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2387 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2388 const struct inet_sock *inet = inet_sk(sk);
0536fcc0 2389 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2390 __be32 dest = inet->inet_daddr;
2391 __be32 src = inet->inet_rcv_saddr;
2392 __u16 destp = ntohs(inet->inet_dport);
2393 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2394 int rx_queue;
00fd38d9 2395 int state;
1da177e4 2396
6ba8a3b1 2397 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
57dde7f7 2398 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
6ba8a3b1 2399 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2400 timer_active = 1;
463c84b9
ACM
2401 timer_expires = icsk->icsk_timeout;
2402 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2403 timer_active = 4;
463c84b9 2404 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2405 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2406 timer_active = 2;
cf4c6bf8 2407 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2408 } else {
2409 timer_active = 0;
2410 timer_expires = jiffies;
2411 }
2412
986ffdfd 2413 state = inet_sk_state_load(sk);
00fd38d9 2414 if (state == TCP_LISTEN)
49d09007
ED
2415 rx_queue = sk->sk_ack_backlog;
2416 else
00fd38d9
ED
2417 /* Because we don't lock the socket,
2418 * we might find a transient negative value.
49d09007
ED
2419 */
2420 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2421
5e659e4c 2422 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2423 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
00fd38d9 2424 i, src, srcp, dest, destp, state,
47da8ee6 2425 tp->write_seq - tp->snd_una,
49d09007 2426 rx_queue,
1da177e4 2427 timer_active,
a399a805 2428 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2429 icsk->icsk_retransmits,
a7cb5a49 2430 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2431 icsk->icsk_probes_out,
cf4c6bf8 2432 sock_i_ino(sk),
41c6d650 2433 refcount_read(&sk->sk_refcnt), sk,
7be87351
SH
2434 jiffies_to_clock_t(icsk->icsk_rto),
2435 jiffies_to_clock_t(icsk->icsk_ack.ato),
31954cd8 2436 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
1da177e4 2437 tp->snd_cwnd,
00fd38d9
ED
2438 state == TCP_LISTEN ?
2439 fastopenq->max_qlen :
652586df 2440 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
1da177e4
LT
2441}
2442
cf533ea5 2443static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2444 struct seq_file *f, int i)
1da177e4 2445{
789f558c 2446 long delta = tw->tw_timer.expires - jiffies;
23f33c2d 2447 __be32 dest, src;
1da177e4 2448 __u16 destp, srcp;
1da177e4
LT
2449
2450 dest = tw->tw_daddr;
2451 src = tw->tw_rcv_saddr;
2452 destp = ntohs(tw->tw_dport);
2453 srcp = ntohs(tw->tw_sport);
2454
5e659e4c 2455 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2456 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2457 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2458 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
41c6d650 2459 refcount_read(&tw->tw_refcnt), tw);
1da177e4
LT
2460}
2461
2462#define TMPSZ 150
2463
2464static int tcp4_seq_show(struct seq_file *seq, void *v)
2465{
5799de0b 2466 struct tcp_iter_state *st;
05dbc7b5 2467 struct sock *sk = v;
1da177e4 2468
652586df 2469 seq_setwidth(seq, TMPSZ - 1);
1da177e4 2470 if (v == SEQ_START_TOKEN) {
652586df 2471 seq_puts(seq, " sl local_address rem_address st tx_queue "
1da177e4
LT
2472 "rx_queue tr tm->when retrnsmt uid timeout "
2473 "inode");
2474 goto out;
2475 }
2476 st = seq->private;
2477
079096f1
ED
2478 if (sk->sk_state == TCP_TIME_WAIT)
2479 get_timewait4_sock(v, seq, st->num);
2480 else if (sk->sk_state == TCP_NEW_SYN_RECV)
aa3a0c8c 2481 get_openreq4(v, seq, st->num);
079096f1
ED
2482 else
2483 get_tcp4_sock(v, seq, st->num);
1da177e4 2484out:
652586df 2485 seq_pad(seq, '\n');
1da177e4
LT
2486 return 0;
2487}
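
The addresses printed by get_tcp4_sock() are the raw __be32 values formatted with %08X, while the ports are converted with ntohs() first, which is why /proc/net/tcp shows 127.0.0.1:22 as 0100007F:0016 on a little-endian machine. A small userspace decoder, assuming such a host (the sample field is invented):

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	const char *field = "0100007F:0016";	/* a local_address column */
	unsigned int addr, port;
	struct in_addr in;
	char buf[INET_ADDRSTRLEN];

	if (sscanf(field, "%x:%x", &addr, &port) != 2)
		return 1;
	/* on a little-endian host the %08X value is already the
	 * network-byte-order word, so it can be used as-is */
	in.s_addr = addr;
	printf("%s:%u\n", inet_ntop(AF_INET, &in, buf, sizeof(buf)), port);
	return 0;
}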
2488
37d849bb
CH
2489static const struct seq_operations tcp4_seq_ops = {
2490 .show = tcp4_seq_show,
2491 .start = tcp_seq_start,
2492 .next = tcp_seq_next,
2493 .stop = tcp_seq_stop,
2494};
2495
1da177e4 2496static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4 2497 .family = AF_INET,
1da177e4
LT
2498};
2499
2c8c1e72 2500static int __net_init tcp4_proc_init_net(struct net *net)
757764f6 2501{
c3506372
CH
2502 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2503 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
37d849bb
CH
2504 return -ENOMEM;
2505 return 0;
757764f6
PE
2506}
2507
2c8c1e72 2508static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6 2509{
37d849bb 2510 remove_proc_entry("tcp", net->proc_net);
757764f6
PE
2511}
2512
2513static struct pernet_operations tcp4_net_ops = {
2514 .init = tcp4_proc_init_net,
2515 .exit = tcp4_proc_exit_net,
2516};
2517
1da177e4
LT
2518int __init tcp4_proc_init(void)
2519{
757764f6 2520 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2521}
2522
2523void tcp4_proc_exit(void)
2524{
757764f6 2525 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2526}
2527#endif /* CONFIG_PROC_FS */
2528
2529struct proto tcp_prot = {
2530 .name = "TCP",
2531 .owner = THIS_MODULE,
2532 .close = tcp_close,
d74bad4e 2533 .pre_connect = tcp_v4_pre_connect,
1da177e4
LT
2534 .connect = tcp_v4_connect,
2535 .disconnect = tcp_disconnect,
463c84b9 2536 .accept = inet_csk_accept,
1da177e4
LT
2537 .ioctl = tcp_ioctl,
2538 .init = tcp_v4_init_sock,
2539 .destroy = tcp_v4_destroy_sock,
2540 .shutdown = tcp_shutdown,
2541 .setsockopt = tcp_setsockopt,
2542 .getsockopt = tcp_getsockopt,
4b9d07a4 2543 .keepalive = tcp_set_keepalive,
1da177e4 2544 .recvmsg = tcp_recvmsg,
7ba42910
CG
2545 .sendmsg = tcp_sendmsg,
2546 .sendpage = tcp_sendpage,
1da177e4 2547 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 2548 .release_cb = tcp_release_cb,
ab1e0a13
ACM
2549 .hash = inet_hash,
2550 .unhash = inet_unhash,
2551 .get_port = inet_csk_get_port,
1da177e4 2552 .enter_memory_pressure = tcp_enter_memory_pressure,
06044751 2553 .leave_memory_pressure = tcp_leave_memory_pressure,
c9bee3b7 2554 .stream_memory_free = tcp_stream_memory_free,
1da177e4 2555 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 2556 .orphan_count = &tcp_orphan_count,
1da177e4
LT
2557 .memory_allocated = &tcp_memory_allocated,
2558 .memory_pressure = &tcp_memory_pressure,
a4fe34bf 2559 .sysctl_mem = sysctl_tcp_mem,
356d1833
ED
2560 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2561 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
1da177e4
LT
2562 .max_header = MAX_TCP_HEADER,
2563 .obj_size = sizeof(struct tcp_sock),
5f0d5a3a 2564 .slab_flags = SLAB_TYPESAFE_BY_RCU,
6d6ee43e 2565 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 2566 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 2567 .h.hashinfo = &tcp_hashinfo,
7ba42910 2568 .no_autobind = true,
543d9cfe
ACM
2569#ifdef CONFIG_COMPAT
2570 .compat_setsockopt = compat_tcp_setsockopt,
2571 .compat_getsockopt = compat_tcp_getsockopt,
d1a4c0b3 2572#endif
c1e64e29 2573 .diag_destroy = tcp_abort,
1da177e4 2574};
4bc2f18b 2575EXPORT_SYMBOL(tcp_prot);
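
tcp_prot is what a plain socket(AF_INET, SOCK_STREAM, 0) ends up backed by; the mapping from SOCK_STREAM/IPPROTO_TCP to this proto is done by the inetsw table in net/ipv4/af_inet.c rather than in this file. A quick userspace check using the standard SO_PROTOCOL option (nothing below is specific to this file):

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int proto = 0;
	socklen_t len = sizeof(proto);

	if (fd >= 0 &&
	    getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &proto, &len) == 0)
		printf("protocol = %d (IPPROTO_TCP = %d)\n", proto, IPPROTO_TCP);
	return 0;
}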
1da177e4 2576
bdbbb852
ED
2577static void __net_exit tcp_sk_exit(struct net *net)
2578{
2579 int cpu;
2580
b506bc97
DL
2581 if (net->ipv4.tcp_congestion_control)
2582 module_put(net->ipv4.tcp_congestion_control->owner);
6670e152 2583
bdbbb852
ED
2584 for_each_possible_cpu(cpu)
2585 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2586 free_percpu(net->ipv4.tcp_sk);
2587}
2588
046ee902
DL
2589static int __net_init tcp_sk_init(struct net *net)
2590{
fee83d09 2591 int res, cpu, cnt;
bdbbb852
ED
2592
2593 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2594 if (!net->ipv4.tcp_sk)
2595 return -ENOMEM;
2596
2597 for_each_possible_cpu(cpu) {
2598 struct sock *sk;
2599
2600 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2601 IPPROTO_TCP, net);
2602 if (res)
2603 goto fail;
a9d6532b 2604 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
431280ee
ED
2605
2606 /* Please enforce IP_DF and IPID==0 for RST and
2607 * ACK sent in SYN-RECV and TIME-WAIT state.
2608 */
2609 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2610
bdbbb852
ED
2611 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2612 }
49213555 2613
5d134f1c 2614 net->ipv4.sysctl_tcp_ecn = 2;
49213555
DB
2615 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2616
b0f9ca53 2617 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
6b58e0a5 2618 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
05cbc0db 2619 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
046ee902 2620
13b287e8 2621 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
9bd6861b 2622 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
b840d15d 2623 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
13b287e8 2624
6fa25166 2625 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
7c083ecb 2626 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
0aca737d 2627 net->ipv4.sysctl_tcp_syncookies = 1;
1043e25f 2628 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
ae5c3f40 2629 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
c6214a97 2630 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
c402d9be 2631 net->ipv4.sysctl_tcp_orphan_retries = 0;
1e579caa 2632 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
4979f2d9 2633 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
79e9fed4 2634 net->ipv4.sysctl_tcp_tw_reuse = 2;
12ed8244 2635
fee83d09 2636 cnt = tcp_hashinfo.ehash_mask + 1;
743e4815 2637 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
1946e672
HY
2638 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2639
fee83d09 2640 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
f9301034 2641 net->ipv4.sysctl_tcp_sack = 1;
9bb37ef0 2642 net->ipv4.sysctl_tcp_window_scaling = 1;
5d2ed052 2643 net->ipv4.sysctl_tcp_timestamps = 1;
2ae21cf5 2644 net->ipv4.sysctl_tcp_early_retrans = 3;
e20223f1 2645 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
b510f0d2 2646 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
e0a1e5b5 2647 net->ipv4.sysctl_tcp_retrans_collapse = 1;
c6e21803 2648 net->ipv4.sysctl_tcp_max_reordering = 300;
6496f6bd 2649 net->ipv4.sysctl_tcp_dsack = 1;
0c12654a 2650 net->ipv4.sysctl_tcp_app_win = 31;
94f0893e 2651 net->ipv4.sysctl_tcp_adv_win_scale = 1;
af9b69a7 2652 net->ipv4.sysctl_tcp_frto = 2;
4540c0cf 2653 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
d06a9904
ED
2654 /* This limits the percentage of the congestion window which we
2655 * will allow a single TSO frame to consume. Building TSO frames
2656 * which are too large can cause TCP streams to be bursty.
2657 */
2658 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
c73e5807
ED
2659 /* Default TSQ limit of 16 TSO segments */
2660 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
b530b681
ED
2661 /* rfc5961 challenge ack rate limiting */
2662 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
26e9596e 2663 net->ipv4.sysctl_tcp_min_tso_segs = 2;
bd239704 2664 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
790f00e1 2665 net->ipv4.sysctl_tcp_autocorking = 1;
4170ba6b 2666 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
23a7102a 2667 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
c26e91f8 2668 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
356d1833
ED
2669 if (net != &init_net) {
2670 memcpy(net->ipv4.sysctl_tcp_rmem,
2671 init_net.ipv4.sysctl_tcp_rmem,
2672 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2673 memcpy(net->ipv4.sysctl_tcp_wmem,
2674 init_net.ipv4.sysctl_tcp_wmem,
2675 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2676 }
6d82aa24 2677 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
9c21d2fc 2678 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
e1cfcbe8 2679 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
43713848 2680 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
3733be14
HY
2681 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2682 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
e1cfcbe8 2683
6670e152
SH
2684 /* Reno is always built in */
2685 if (!net_eq(net, &init_net) &&
2686 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2687 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2688 else
2689 net->ipv4.tcp_congestion_control = &tcp_reno;
2690
49213555 2691 return 0;
bdbbb852
ED
2692fail:
2693 tcp_sk_exit(net);
2694
2695 return res;
b099ce26
EB
2696}
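
Most of the defaults assigned in tcp_sk_init() are exposed as per-network-namespace sysctls under /proc/sys/net/ipv4/. A minimal userspace read-back of one of them; the path is the standard procfs location, and the value returned is whatever the current namespace has configured (TCP_SYN_RETRIES is 6 in mainline of this era):

#include <stdio.h>

int main(void)
{
	/* the per-netns default set above is exposed via procfs */
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syn_retries", "r");
	int val;

	if (f && fscanf(f, "%d", &val) == 1)
		printf("tcp_syn_retries = %d\n", val);
	if (f)
		fclose(f);
	return 0;
}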
2697
2698static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2699{
43713848
HY
2700 struct net *net;
2701
1946e672 2702 inet_twsk_purge(&tcp_hashinfo, AF_INET);
43713848
HY
2703
2704 list_for_each_entry(net, net_exit_list, exit_list)
2705 tcp_fastopen_ctx_destroy(net);
046ee902
DL
2706}
2707
2708static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
2709 .init = tcp_sk_init,
2710 .exit = tcp_sk_exit,
2711 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
2712};
2713
9b0f976f 2714void __init tcp_v4_init(void)
1da177e4 2715{
6a1b3054 2716 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 2717 panic("Failed to create the TCP control socket.\n");
1da177e4 2718}