net/ipv4/tcp_ipv4.c

/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

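/* Pick the initial sequence number for an incoming connection request.
 * secure_tcp_sequence_number() (net/core/secure_seq.c) follows the
 * RFC 1948 approach: a keyed hash of the connection 4-tuple plus a
 * clocked component, so ISNs are hard to predict off-path yet still
 * advance monotonically per 4-tuple.  Note the daddr/saddr and
 * dest/source swap: the skb here is the SYN we received.
 */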
static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

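/* Decide whether a TIME-WAIT socket may be reused for a new outgoing
 * connection with the same 4-tuple.  On success the new write_seq starts
 * 65535 + 2 beyond the old tw_snd_nxt, i.e. past the largest possible
 * unscaled window, so the old and new sequence spaces cannot overlap,
 * and the cached timestamp state is inherited so PAWS keeps working.
 */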
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint of data integrity.
	   Even without PAWS it is safe provided sequence spaces do not
	   overlap, i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only the timestamp
	   cache is held not per host but per port pair, and the TW bucket
	   is used as the state holder.

	   If the TW bucket has already been destroyed, we fall back to
	   VJ's scheme and use the initial timestamp retrieved from the
	   peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_repair_connect(struct sock *sk)
{
	tcp_connect_init(sk);
	tcp_finish_connect(sk, NULL);

	return 0;
}

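/* Active open, step by step: route lookup, move to SYN-SENT, pick a
 * source port and hash the socket in inet_hash_connect(), re-check the
 * route with the final ports, choose the initial sequence number, then
 * build and send the SYN via tcp_connect().
 */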
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk, true);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set the state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the hash
	 * tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	if (likely(!tp->repair))
		err = tcp_connect(sk);
	else
		err = tcp_repair_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
static void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go
	 * through unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *req;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of the PMTU discovery (RFC 1191) special case:
	 * we can receive locally generated ICMP messages while the socket
	 * is held.
	 */
	if (sock_owned_by_user(sk) &&
	    type != ICMP_DEST_UNREACH &&
	    code != ICMP_FRAG_NEEDED)
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	req = tp->fastopen_rsk;
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt) &&
	    (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
		/* For a Fast Open socket, allow seq to be snt_isn. */
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		/* XXX (TFO) - revisit the following logic for TFO */

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	/* XXX (TFO) - if it's a TFO socket and has been accepted, rather
	 * than following the TCP_SYN_RECV case and closing the socket,
	 * we ignore the ICMP error and keep trying like a fully established
	 * socket. Is this the right thing to do?
	 */
	if (req && req->sk == NULL)
		goto out;

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen normally.  It can, e.g., if
			       SYNs crossed, or with Fast Open.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows considering only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
	 * obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors as their masters order, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

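/* With CHECKSUM_PARTIAL the checksum field is seeded with the pseudo-header
 * sum only; csum_start/csum_offset tell the device (or skb_checksum_help())
 * where to fold in the checksum over the TCP header and payload.  The else
 * branch computes the full checksum in software.
 */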
static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	skb->ip_summed = CHECKSUM_PARTIAL;
	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	return 0;
}

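/* Reset generation follows RFC 793: if the offending segment carried an
 * ACK, the RST reuses that ACK value as its own sequence number; otherwise
 * the RST ACKs everything the segment occupied (payload plus the SYN and
 * FIN flags), so the peer can match it to the segment that triggered it.
 */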
/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		 for the reset?
 * Answer: if a packet caused a RST, it is not for a socket
 *	   existing in our system; if it is matched to a socket,
 *	   it is just a duplicate segment or a bug in the other
 *	   side's TCP.  So we build the reply based only on the
 *	   parameters that arrived with the segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * The active side is lost.  Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket.  We do not loosen security here:
		 * the incoming packet is checked against the md5 hash of the
		 * key we find, and no RST is generated if the hash doesn't
		 * match.
		 */
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
					     &tcp_hashinfo, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send a RST if we can't find a key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When the socket is gone, all binding information is lost;
	 * routing might fail in this case.  No choice here: if we choose
	 * to force the input interface, we will misroute in case of an
	 * asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

/* The code below, sending ACKs in SYN-RECV and TIME-WAIT states outside
   socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

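/* Answer a segment that arrived for a connection now in TIME-WAIT
 * (e.g. a retransmitted FIN), using the state preserved in the
 * timewait bucket.
 */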
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
 * socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      struct request_values *rvp,
			      u16 queue_mapping,
			      bool nocache)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, rvp, NULL);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
		if (!tcp_rsk(req)->snt_synack && !err)
			tcp_rsk(req)->snt_synack = tcp_time_stamp;
	}

	return err;
}

static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
			     struct request_values *rvp)
{
	int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);

	if (!res)
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
	return res;
}

/*
 * IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(&dopt->opt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address. */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *pos;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (hlist_empty(&md5sig->head))
		tcp_free_md5sig_pool();
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *pos, *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	if (!hlist_empty(&md5sig->head))
		tcp_free_md5sig_pool();
	hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

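/*
 * Userspace manages these keys with the TCP_MD5SIG socket option, which
 * lands in tcp_v4_parse_md5_keys() above.  A minimal sketch (illustrative
 * only, not part of this file):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the key for that peer address.
 */
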
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->loc_addr;
		daddr = inet_rsk(req)->rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

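/* Per RFC 2385 the MD5 digest covers, in order: the IPv4 pseudo-header,
 * the TCP header with the checksum field zeroed (tcp_md5_hash_header()
 * works on a copy), the segment payload, and finally the key itself.
 * tcp_v4_md5_hash_hdr() above is the payload-less variant used for RSTs
 * and ACKs generated without a full socket.
 */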
static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif

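/* Server-side Fast Open cookie processing.  Returns true if the data in
 * the SYN may be accepted immediately: either no cookie is required by
 * configuration, or the client presented a cookie matching what
 * tcp_fastopen_cookie_gen() derives for its source address.  A bare
 * cookie request (foc->len == 0) and an invalid cookie both cause a
 * fresh valid cookie to be returned in the SYN-ACK instead.
 */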
static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
			       struct request_sock *req,
			       struct tcp_fastopen_cookie *foc,
			       struct tcp_fastopen_cookie *valid_foc)
{
	bool skip_cookie = false;
	struct fastopen_queue *fastopenq;

	if (likely(!fastopen_cookie_present(foc))) {
		/* See include/net/tcp.h for the meaning of these knobs */
		if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
		    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
			skip_cookie = true; /* no cookie to validate */
		else
			return false;
	}
	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
	/* A FO option is present; bump the counter. */
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);

	/* Make sure the listener has enabled fastopen, and we don't
	 * exceed the max # of pending TFO requests allowed before trying
	 * to validate the cookie, in order to avoid burning CPU cycles
	 * unnecessarily.
	 *
	 * XXX (TFO) - The implication of checking the max_qlen before
	 * processing a cookie request is that clients can't differentiate
	 * between qlen overflow causing Fast Open to be disabled
	 * temporarily vs a server not supporting Fast Open at all.
	 */
	if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
	    fastopenq == NULL || fastopenq->max_qlen == 0)
		return false;

	if (fastopenq->qlen >= fastopenq->max_qlen) {
		struct request_sock *req1;
		spin_lock(&fastopenq->lock);
		req1 = fastopenq->rskq_rst_head;
		if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
			spin_unlock(&fastopenq->lock);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
			/* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL */
			foc->len = -1;
			return false;
		}
		fastopenq->rskq_rst_head = req1->dl_next;
		fastopenq->qlen--;
		spin_unlock(&fastopenq->lock);
		reqsk_free(req1);
	}
	if (skip_cookie) {
		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		return true;
	}
	if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
		if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
			tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
			if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
			    memcmp(&foc->val[0], &valid_foc->val[0],
				   TCP_FASTOPEN_COOKIE_SIZE) != 0)
				return false;
			valid_foc->len = -1;
		}
		/* Acknowledge the data received from the peer. */
		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		return true;
	} else if (foc->len == 0) { /* Client requesting a cookie */
		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
		NET_INC_STATS_BH(sock_net(sk),
				 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
	} else {
		/* Client sent a cookie with wrong size. Treat it
		 * the same as invalid and return a valid one.
		 */
		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
	}
	return false;
}
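
/* Fast Open passive open: create the child socket right away (before the
 * SYN-ACK is even sent), queue any data carried in the SYN, and place the
 * child directly on the accept queue so the listener can accept() it
 * while the handshake is still completing.
 */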
static int tcp_v4_conn_req_fastopen(struct sock *sk,
				    struct sk_buff *skb,
				    struct sk_buff *skb_synack,
				    struct request_sock *req,
				    struct request_values *rvp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct sock *child;
	int err;

	req->num_retrans = 0;
	req->num_timeout = 0;
	req->sk = NULL;

	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL) {
		NET_INC_STATS_BH(sock_net(sk),
				 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
		kfree_skb(skb_synack);
		return -1;
	}
	err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
				    ireq->rmt_addr, ireq->opt);
	err = net_xmit_eval(err);
	if (!err)
		tcp_rsk(req)->snt_synack = tcp_time_stamp;
	/* XXX (TFO) - is it ok to ignore error and continue? */

	spin_lock(&queue->fastopenq->lock);
	queue->fastopenq->qlen++;
	spin_unlock(&queue->fastopenq->lock);

	/* Initialize the child socket. Have to fix some values to take
	 * into account the child is a Fast Open socket and is created
	 * only out of the bits carried in the SYN packet.
	 */
	tp = tcp_sk(child);

	tp->fastopen_rsk = req;
	/* Do a hold on the listener sk so that if the listener is being
	 * closed, the child that has been accepted can live on and still
	 * access listen_lock.
	 */
	sock_hold(sk);
	tcp_rsk(req)->listener = sk;

	/* RFC1323: The window in SYN & SYN/ACK segments is never
	 * scaled. So correct it appropriately.
	 */
	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);

	/* Activate the retrans timer so that SYNACK can be retransmitted.
	 * The request socket is not added to the SYN table of the parent
	 * because it's been added to the accept queue directly.
	 */
	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
				  TCP_TIMEOUT_INIT, TCP_RTO_MAX);

	/* Add the child socket directly into the accept queue */
	inet_csk_reqsk_queue_add(sk, req, child);

	/* Now finish processing the fastopen child socket. */
	inet_csk(child)->icsk_af_ops->rebuild_header(child);
	tcp_init_congestion_control(child);
	tcp_mtup_init(child);
	tcp_init_buffer_space(child);
	tcp_init_metrics(child);

	/* Queue the data carried in the SYN packet. We need to first
	 * bump skb's refcnt because the caller will attempt to free it.
	 *
	 * XXX (TFO) - we honor a zero-payload TFO request for now.
	 * (Any reason not to?)
	 */
	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
		/* Don't queue the skb if there is no payload in SYN.
		 * XXX (TFO) - How about SYN+FIN?
		 */
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
	} else {
		skb = skb_get(skb);
		skb_dst_drop(skb);
		__skb_pull(skb, tcp_hdr(skb)->doff * 4);
		skb_set_owner_r(skb, child);
		__skb_queue_tail(&child->sk_receive_queue, skb);
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		tp->syn_data_acked = 1;
	}
	sk->sk_data_ready(sk, 0);
	bh_unlock_sock(child);
	sock_put(child);
	WARN_ON(req->sk == NULL);
	return 0;
}

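/* Passive open entry point, called for every incoming SYN (and for SYN
 * data with Fast Open): allocate a request_sock, parse options, choose or
 * cook the ISN (syncookies when the queue is full), then either send a
 * SYN-ACK and park the request in the SYN table, or hand off to the Fast
 * Open path above.
 */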
1da177e4
LT
1477int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1478{
4957faad 1479 struct tcp_extend_values tmp_ext;
1da177e4 1480 struct tcp_options_received tmp_opt;
cf533ea5 1481 const u8 *hash_location;
60236fdd 1482 struct request_sock *req;
e6b4d113 1483 struct inet_request_sock *ireq;
4957faad 1484 struct tcp_sock *tp = tcp_sk(sk);
e6b4d113 1485 struct dst_entry *dst = NULL;
eddc9ec5
ACM
1486 __be32 saddr = ip_hdr(skb)->saddr;
1487 __be32 daddr = ip_hdr(skb)->daddr;
1da177e4 1488 __u32 isn = TCP_SKB_CB(skb)->when;
a2a385d6 1489 bool want_cookie = false;
168a8f58
JC
1490 struct flowi4 fl4;
1491 struct tcp_fastopen_cookie foc = { .len = -1 };
1492 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1493 struct sk_buff *skb_synack;
1494 int do_fastopen;
1da177e4
LT
1495
1496 /* Never answer to SYNs send to broadcast or multicast */
511c3f92 1497 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1da177e4
LT
1498 goto drop;
1499
1500 /* TW buckets are converted to open requests without
1501 * limitations, they conserve resources and peer is
1502 * evidently real one.
1503 */
463c84b9 1504 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
946cedcc
ED
1505 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1506 if (!want_cookie)
1507 goto drop;
1da177e4
LT
1508 }
1509
1510 /* Accept backlog is full. If we have already queued enough
1511 * of warm entries in syn queue, drop request. It is better than
1512 * clogging syn queue with openreqs with exponentially increasing
1513 * timeout.
1514 */
463c84b9 1515 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1da177e4
LT
1516 goto drop;
1517
ce4a7d0d 1518 req = inet_reqsk_alloc(&tcp_request_sock_ops);
1da177e4
LT
1519 if (!req)
1520 goto drop;
1521
cfb6eeb4
YH
1522#ifdef CONFIG_TCP_MD5SIG
1523 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1524#endif
1525
1da177e4 1526 tcp_clear_options(&tmp_opt);
bee7ca9e 1527 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
4957faad 1528 tmp_opt.user_mss = tp->rx_opt.user_mss;
168a8f58
JC
1529 tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1530 want_cookie ? NULL : &foc);
4957faad
WAS
1531
1532 if (tmp_opt.cookie_plus > 0 &&
1533 tmp_opt.saw_tstamp &&
1534 !tp->rx_opt.cookie_out_never &&
1535 (sysctl_tcp_cookie_size > 0 ||
1536 (tp->cookie_values != NULL &&
1537 tp->cookie_values->cookie_desired > 0))) {
1538 u8 *c;
1539 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1540 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1541
1542 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1543 goto drop_and_release;
1544
1545 /* Secret recipe starts with IP addresses */
0eae88f3
ED
1546 *mess++ ^= (__force u32)daddr;
1547 *mess++ ^= (__force u32)saddr;
1da177e4 1548
4957faad
WAS
1549 /* plus variable length Initiator Cookie */
1550 c = (u8 *)mess;
1551 while (l-- > 0)
1552 *c++ ^= *hash_location++;
1553
a2a385d6 1554 want_cookie = false; /* not our kind of cookie */
4957faad
WAS
1555 tmp_ext.cookie_out_never = 0; /* false */
1556 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1557 } else if (!tp->rx_opt.cookie_in_always) {
1558 /* redundant indications, but ensure initialization. */
1559 tmp_ext.cookie_out_never = 1; /* true */
1560 tmp_ext.cookie_plus = 0;
1561 } else {
1562 goto drop_and_release;
1563 }
1564 tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1da177e4 1565
4dfc2817 1566 if (want_cookie && !tmp_opt.saw_tstamp)
    tcp_clear_options(&tmp_opt);

    tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
    tcp_openreq_init(req, &tmp_opt, skb);

    ireq = inet_rsk(req);
    ireq->loc_addr = daddr;
    ireq->rmt_addr = saddr;
    ireq->no_srccheck = inet_sk(sk)->transparent;
    ireq->opt = tcp_v4_save_options(skb);

    if (security_inet_conn_request(sk, skb, req))
        goto drop_and_free;

    if (!want_cookie || tmp_opt.tstamp_ok)
        TCP_ECN_create_request(req, skb);

    if (want_cookie) {
        isn = cookie_v4_init_sequence(sk, skb, &req->mss);
        req->cookie_ts = tmp_opt.tstamp_ok;
    } else if (!isn) {
        /* VJ's idea. We save the last timestamp seen
         * from the destination in the peer table, when entering
         * state TIME-WAIT, and check against it before
         * accepting a new connection request.
         *
         * If "isn" is not zero, this request hit an alive
         * timewait bucket, so that all the necessary checks
         * are made in the function processing timewait state.
         */
        if (tmp_opt.saw_tstamp &&
            tcp_death_row.sysctl_tw_recycle &&
            (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
            fl4.daddr == saddr) {
            if (!tcp_peer_is_proven(req, dst, true)) {
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                goto drop_and_release;
            }
        }
        /* Kill the following clause, if you dislike this way. */
        else if (!sysctl_tcp_syncookies &&
             (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
              (sysctl_max_syn_backlog >> 2)) &&
             !tcp_peer_is_proven(req, dst, false)) {
            /* Without syncookies, the last quarter of the
             * backlog is filled with destinations proven
             * to be alive.
             * It means that we continue to communicate
             * with destinations already remembered at
             * the moment of synflood.
             */
            LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
                       &saddr, ntohs(tcp_hdr(skb)->source));
            goto drop_and_release;
        }

        isn = tcp_v4_init_sequence(skb);
    }
    tcp_rsk(req)->snt_isn = isn;

    if (dst == NULL) {
        dst = inet_csk_route_req(sk, &fl4, req);
        if (dst == NULL)
            goto drop_and_free;
    }
    do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);

    /* We don't call tcp_v4_send_synack() directly because we need
     * to make sure a child socket can be created successfully before
     * sending back a synack!
     *
     * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
     * (or better yet, call tcp_send_synack() in the child context
     * directly, but one will have to fix a bunch of other code first)
     * after syn_recv_sock() except one will need to first fix the
     * latter to remove its dependency on the current implementation
     * of tcp_v4_send_synack()->tcp_select_initial_window().
     */
    skb_synack = tcp_make_synack(sk, dst, req,
                     (struct request_values *)&tmp_ext,
                     fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);

    if (skb_synack) {
        __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
        skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
    } else
        goto drop_and_free;

    if (likely(!do_fastopen)) {
        int err;
        err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
                        ireq->rmt_addr, ireq->opt);
        err = net_xmit_eval(err);
        if (err || want_cookie)
            goto drop_and_free;

        tcp_rsk(req)->snt_synack = tcp_time_stamp;
        tcp_rsk(req)->listener = NULL;
        /* Add the request_sock to the SYN table */
        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        if (fastopen_cookie_present(&foc) && foc.len != 0)
            NET_INC_STATS_BH(sock_net(sk),
                     LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
    } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
                        (struct request_values *)&tmp_ext))
        goto drop_and_free;

    return 0;

drop_and_release:
    dst_release(dst);
drop_and_free:
    reqsk_free(req);
drop:
    return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three-way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
                  struct request_sock *req,
                  struct dst_entry *dst)
{
    struct inet_request_sock *ireq;
    struct inet_sock *newinet;
    struct tcp_sock *newtp;
    struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
    struct tcp_md5sig_key *key;
#endif
    struct ip_options_rcu *inet_opt;

    if (sk_acceptq_is_full(sk))
        goto exit_overflow;

    newsk = tcp_create_openreq_child(sk, req, skb);
    if (!newsk)
        goto exit_nonewsk;

    newsk->sk_gso_type = SKB_GSO_TCPV4;
    inet_sk_rx_dst_set(newsk, skb);

    newtp = tcp_sk(newsk);
    newinet = inet_sk(newsk);
    ireq = inet_rsk(req);
    newinet->inet_daddr = ireq->rmt_addr;
    newinet->inet_rcv_saddr = ireq->loc_addr;
    newinet->inet_saddr = ireq->loc_addr;
    inet_opt = ireq->opt;
    rcu_assign_pointer(newinet->inet_opt, inet_opt);
    ireq->opt = NULL;
    newinet->mc_index = inet_iif(skb);
    newinet->mc_ttl = ip_hdr(skb)->ttl;
    newinet->rcv_tos = ip_hdr(skb)->tos;
    inet_csk(newsk)->icsk_ext_hdr_len = 0;
    if (inet_opt)
        inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
    newinet->inet_id = newtp->write_seq ^ jiffies;

    if (!dst) {
        dst = inet_csk_route_child_sock(sk, newsk, req);
        if (!dst)
            goto put_and_exit;
    } else {
        /* syncookie case : see end of cookie_v4_check() */
    }
    sk_setup_caps(newsk, dst);

    tcp_mtup_init(newsk);
    tcp_sync_mss(newsk, dst_mtu(dst));
    newtp->advmss = dst_metric_advmss(dst);
    if (tcp_sk(sk)->rx_opt.user_mss &&
        tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
        newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

    tcp_initialize_rcv_mss(newsk);
    tcp_synack_rtt_meas(newsk, req);
    newtp->total_retrans = req->num_retrans;

#ifdef CONFIG_TCP_MD5SIG
    /* Copy over the MD5 key from the original socket */
    key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
                AF_INET);
    if (key != NULL) {
        /*
         * We're using one, so create a matching key
         * on the newsk structure. If we fail to get
         * memory, then we end up not copying the key
         * across. Shucks.
         */
        tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
                   AF_INET, key->key, key->keylen, GFP_ATOMIC);
        sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
    }
#endif

    if (__inet_inherit_port(sk, newsk) < 0)
        goto put_and_exit;
    __inet_hash_nolisten(newsk, NULL);

    return newsk;

exit_overflow:
    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
    dst_release(dst);
exit:
    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
    return NULL;
put_and_exit:
    tcp_clear_xmit_timers(newsk);
    tcp_cleanup_congestion_control(newsk);
    bh_unlock_sock(newsk);
    sock_put(newsk);
    goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
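
/*
 * Context note (illustrative, not part of this file): the key copied
 * above is whatever user space installed on the listener with the
 * TCP_MD5SIG socket option.  A minimal sketch of installing one,
 * assuming a libc that exposes struct tcp_md5sig via <netinet/tcp.h>:
 */
#if 0	/* example only */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>	/* struct tcp_md5sig, TCP_MD5SIG */

static int install_md5_key(int listen_fd, const struct sockaddr_in *peer,
               const char *secret)
{
    struct tcp_md5sig md5 = { .tcpm_keylen = strlen(secret) };

    memcpy(&md5.tcpm_addr, peer, sizeof(*peer));	/* key is per-peer */
    memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
    return setsockopt(listen_fd, IPPROTO_TCP, TCP_MD5SIG,
              &md5, sizeof(md5));
}
#endif
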
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
    struct tcphdr *th = tcp_hdr(skb);
    const struct iphdr *iph = ip_hdr(skb);
    struct sock *nsk;
    struct request_sock **prev;
    /* Find possible connection requests. */
    struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
                               iph->saddr, iph->daddr);
    if (req)
        return tcp_check_req(sk, skb, req, prev, false);

    nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
            th->source, iph->daddr, th->dest, inet_iif(skb));

    if (nsk) {
        if (nsk->sk_state != TCP_TIME_WAIT) {
            bh_lock_sock(nsk);
            return nsk;
        }
        inet_twsk_put(inet_twsk(nsk));
        return NULL;
    }

#ifdef CONFIG_SYN_COOKIES
    if (!th->syn)
        sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
    return sk;
}

static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
    const struct iphdr *iph = ip_hdr(skb);

    if (skb->ip_summed == CHECKSUM_COMPLETE) {
        if (!tcp_v4_check(skb->len, iph->saddr,
                  iph->daddr, skb->csum)) {
            skb->ip_summed = CHECKSUM_UNNECESSARY;
            return 0;
        }
    }

    skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
                       skb->len, IPPROTO_TCP, 0);

    if (skb->len <= 76) {
        return __skb_checksum_complete(skb);
    }
    return 0;
}

/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
    /*
     * We really want to reject the packet as early as possible
     * if:
     *  o We're expecting an MD5'd packet and this is no MD5 tcp option
     *  o There is an MD5 option and we're not expecting one
     */
    if (tcp_v4_inbound_md5_hash(sk, skb))
        goto discard;
#endif

    if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
        struct dst_entry *dst = sk->sk_rx_dst;

        sock_rps_save_rxhash(sk, skb);
        if (dst) {
            if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
                dst->ops->check(dst, 0) == NULL) {
                dst_release(dst);
                sk->sk_rx_dst = NULL;
            }
        }
        if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
            rsk = sk;
            goto reset;
        }
        return 0;
    }

    if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
        goto csum_err;

    if (sk->sk_state == TCP_LISTEN) {
        struct sock *nsk = tcp_v4_hnd_req(sk, skb);
        if (!nsk)
            goto discard;

        if (nsk != sk) {
            sock_rps_save_rxhash(nsk, skb);
            if (tcp_child_process(sk, nsk, skb)) {
                rsk = nsk;
                goto reset;
            }
            return 0;
        }
    } else
        sock_rps_save_rxhash(sk, skb);

    if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
        rsk = sk;
        goto reset;
    }
    return 0;

reset:
    tcp_v4_send_reset(rsk, skb);
discard:
    kfree_skb(skb);
    /* Be careful here. If this function gets more complicated and
     * gcc suffers from register pressure on the x86, sk (in %ebx)
     * might be destroyed here. This current version compiles correctly,
     * but you have been warned.
     */
    return 0;

csum_err:
    TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
    goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
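
/*
 * Illustrative sketch (not part of this file): why the BH locking
 * scheme above matters.  Process context takes the socket with
 * lock_sock() and marks it owned; softirq context takes only the
 * spinlock and, if the socket is owned, defers the skb to the backlog
 * instead of processing it.  Roughly, using the core socket API as it
 * appears in tcp_v4_rcv() below:
 */
#if 0	/* example only */
static void softirq_deliver(struct sock *sk, struct sk_buff *skb)
{
    bh_lock_sock(sk);			/* spinlock only, never sleeps */
    if (!sock_owned_by_user(sk))
        tcp_v4_do_rcv(sk, skb);		/* fast path: process in BH */
    else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))
        kfree_skb(skb);			/* backlog full: drop */
    bh_unlock_sock(sk);			/* backlog drains in release_sock() */
}
#endif
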
void tcp_v4_early_demux(struct sk_buff *skb)
{
    const struct iphdr *iph;
    const struct tcphdr *th;
    struct sock *sk;

    if (skb->pkt_type != PACKET_HOST)
        return;

    if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
        return;

    iph = ip_hdr(skb);
    th = tcp_hdr(skb);

    if (th->doff < sizeof(struct tcphdr) / 4)
        return;

    sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
                       iph->saddr, th->source,
                       iph->daddr, ntohs(th->dest),
                       skb->skb_iif);
    if (sk) {
        skb->sk = sk;
        skb->destructor = sock_edemux;
        if (sk->sk_state != TCP_TIME_WAIT) {
            struct dst_entry *dst = sk->sk_rx_dst;

            if (dst)
                dst = dst_check(dst, 0);
            if (dst &&
                inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
                skb_dst_set_noref(skb, dst);
        }
    }
}
/*
 * From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
    const struct iphdr *iph;
    const struct tcphdr *th;
    struct sock *sk;
    int ret;
    struct net *net = dev_net(skb->dev);

    if (skb->pkt_type != PACKET_HOST)
        goto discard_it;

    /* Count it even if it's bad */
    TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

    if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
        goto discard_it;

    th = tcp_hdr(skb);

    if (th->doff < sizeof(struct tcphdr) / 4)
        goto bad_packet;
    if (!pskb_may_pull(skb, th->doff * 4))
        goto discard_it;

    /* An explanation is required here, I think.
     * Packet length and doff are validated by header prediction,
     * provided the case of th->doff==0 is eliminated.
     * So, we defer the checks. */
    if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
        goto bad_packet;

    th = tcp_hdr(skb);
    iph = ip_hdr(skb);
    TCP_SKB_CB(skb)->seq = ntohl(th->seq);
    TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                    skb->len - th->doff * 4);
    TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
    TCP_SKB_CB(skb)->when = 0;
    TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
    TCP_SKB_CB(skb)->sacked = 0;

    sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
    if (!sk)
        goto no_tcp_socket;

process:
    if (sk->sk_state == TCP_TIME_WAIT)
        goto do_time_wait;

    if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
        NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
        goto discard_and_relse;
    }

    if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
        goto discard_and_relse;
    nf_reset(skb);

    if (sk_filter(sk, skb))
        goto discard_and_relse;

    skb->dev = NULL;

    bh_lock_sock_nested(sk);
    ret = 0;
    if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
        struct tcp_sock *tp = tcp_sk(sk);
        if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
            tp->ucopy.dma_chan = net_dma_find_channel();
        if (tp->ucopy.dma_chan)
            ret = tcp_v4_do_rcv(sk, skb);
        else
#endif
        {
            if (!tcp_prequeue(sk, skb))
                ret = tcp_v4_do_rcv(sk, skb);
        }
    } else if (unlikely(sk_add_backlog(sk, skb,
                       sk->sk_rcvbuf + sk->sk_sndbuf))) {
        bh_unlock_sock(sk);
        NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
        goto discard_and_relse;
    }
    bh_unlock_sock(sk);

    sock_put(sk);

    return ret;

no_tcp_socket:
    if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
        goto discard_it;

    if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
        TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
    } else {
        tcp_v4_send_reset(NULL, skb);
    }

discard_it:
    /* Discard frame. */
    kfree_skb(skb);
    return 0;

discard_and_relse:
    sock_put(sk);
    goto discard_it;

do_time_wait:
    if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
        inet_twsk_put(inet_twsk(sk));
        goto discard_it;
    }

    if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
        TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
        inet_twsk_put(inet_twsk(sk));
        goto discard_it;
    }
    switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
    case TCP_TW_SYN: {
        struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
                            &tcp_hashinfo,
                            iph->daddr, th->dest,
                            inet_iif(skb));
        if (sk2) {
            inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
            inet_twsk_put(inet_twsk(sk));
            sk = sk2;
            goto process;
        }
        /* Fall through to ACK */
    }
    case TCP_TW_ACK:
        tcp_v4_timewait_ack(sk, skb);
        break;
    case TCP_TW_RST:
        goto no_tcp_socket;
    case TCP_TW_SUCCESS:;
    }
    goto discard_it;
}
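
/*
 * Illustrative sketch (not part of this file): the end_seq computed in
 * tcp_v4_rcv() counts SYN and FIN as one sequence number each on top of
 * the payload.  Worked through with hypothetical numbers:
 */
#if 0	/* example only */
#include <stdint.h>

static uint32_t tcp_end_seq(uint32_t seq, int syn, int fin,
                uint32_t payload_len)
{
    /* e.g. a bare SYN: seq=1000, payload=0 -> end_seq=1001 */
    return seq + syn + fin + payload_len;
}
#endif
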
static struct timewait_sock_ops tcp_timewait_sock_ops = {
    .twsk_obj_size	= sizeof(struct tcp_timewait_sock),
    .twsk_unique	= tcp_twsk_unique,
    .twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
    struct dst_entry *dst = skb_dst(skb);

    dst_hold(dst);
    sk->sk_rx_dst = dst;
    inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
    .queue_xmit	   = ip_queue_xmit,
    .send_check	   = tcp_v4_send_check,
    .rebuild_header	   = inet_sk_rebuild_header,
    .sk_rx_dst_set	   = inet_sk_rx_dst_set,
    .conn_request	   = tcp_v4_conn_request,
    .syn_recv_sock	   = tcp_v4_syn_recv_sock,
    .net_header_len	   = sizeof(struct iphdr),
    .setsockopt	   = ip_setsockopt,
    .getsockopt	   = ip_getsockopt,
    .addr2sockaddr	   = inet_csk_addr2sockaddr,
    .sockaddr_len	   = sizeof(struct sockaddr_in),
    .bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
    .compat_setsockopt = compat_ip_setsockopt,
    .compat_getsockopt = compat_ip_getsockopt,
#endif
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
    .md5_lookup	= tcp_v4_md5_lookup,
    .calc_md5_hash	= tcp_v4_md5_hash_skb,
    .md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
    struct inet_connection_sock *icsk = inet_csk(sk);

    tcp_init_sock(sk);

    icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
    tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

    return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);

    tcp_clear_xmit_timers(sk);

    tcp_cleanup_congestion_control(sk);

    /* Clean up the write buffer. */
    tcp_write_queue_purge(sk);

    /* Cleans up our, hopefully empty, out_of_order_queue. */
    __skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
    /* Clean up the MD5 key list, if any */
    if (tp->md5sig_info) {
        tcp_clear_md5_list(sk);
        kfree_rcu(tp->md5sig_info, rcu);
        tp->md5sig_info = NULL;
    }
#endif

#ifdef CONFIG_NET_DMA
    /* Cleans up our sk_async_wait_queue */
    __skb_queue_purge(&sk->sk_async_wait_queue);
#endif

    /* Clean prequeue, it must be empty really */
    __skb_queue_purge(&tp->ucopy.prequeue);

    /* Clean up a referenced TCP bind bucket. */
    if (inet_csk(sk)->icsk_bind_hash)
        inet_put_port(sk);

    /* TCP Cookie Transactions */
    if (tp->cookie_values != NULL) {
        kref_put(&tp->cookie_values->kref,
             tcp_cookie_values_release);
        tp->cookie_values = NULL;
    }
    BUG_ON(tp->fastopen_rsk != NULL);

    /* If the socket is aborted during a connect operation */
    tcp_free_fastopen_req(tp);

    sk_sockets_allocated_dec(sk);
    sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
{
    return hlist_nulls_empty(head) ? NULL :
        list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
    return !is_a_nulls(tw->tw_node.next) ?
        hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

/*
 * Get the next listener socket following cur.  If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
    struct inet_connection_sock *icsk;
    struct hlist_nulls_node *node;
    struct sock *sk = cur;
    struct inet_listen_hashbucket *ilb;
    struct tcp_iter_state *st = seq->private;
    struct net *net = seq_file_net(seq);

    if (!sk) {
        ilb = &tcp_hashinfo.listening_hash[st->bucket];
        spin_lock_bh(&ilb->lock);
        sk = sk_nulls_head(&ilb->head);
        st->offset = 0;
        goto get_sk;
    }
    ilb = &tcp_hashinfo.listening_hash[st->bucket];
    ++st->num;
    ++st->offset;

    if (st->state == TCP_SEQ_STATE_OPENREQ) {
        struct request_sock *req = cur;

        icsk = inet_csk(st->syn_wait_sk);
        req = req->dl_next;
        while (1) {
            while (req) {
                if (req->rsk_ops->family == st->family) {
                    cur = req;
                    goto out;
                }
                req = req->dl_next;
            }
            if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
                break;
get_req:
            req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
        }
        sk = sk_nulls_next(st->syn_wait_sk);
        st->state = TCP_SEQ_STATE_LISTENING;
        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
    } else {
        icsk = inet_csk(sk);
        read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
        if (reqsk_queue_len(&icsk->icsk_accept_queue))
            goto start_req;
        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
        sk = sk_nulls_next(sk);
    }
get_sk:
    sk_nulls_for_each_from(sk, node) {
        if (!net_eq(sock_net(sk), net))
            continue;
        if (sk->sk_family == st->family) {
            cur = sk;
            goto out;
        }
        icsk = inet_csk(sk);
        read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
        if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
            st->uid = sock_i_uid(sk);
            st->syn_wait_sk = sk;
            st->state = TCP_SEQ_STATE_OPENREQ;
            st->sbucket = 0;
            goto get_req;
        }
        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
    }
    spin_unlock_bh(&ilb->lock);
    st->offset = 0;
    if (++st->bucket < INET_LHTABLE_SIZE) {
        ilb = &tcp_hashinfo.listening_hash[st->bucket];
        spin_lock_bh(&ilb->lock);
        sk = sk_nulls_head(&ilb->head);
        goto get_sk;
    }
    cur = NULL;
out:
    return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
    struct tcp_iter_state *st = seq->private;
    void *rc;

    st->bucket = 0;
    st->offset = 0;
    rc = listening_get_next(seq, NULL);

    while (rc && *pos) {
        rc = listening_get_next(seq, rc);
        --*pos;
    }
    return rc;
}

static inline bool empty_bucket(struct tcp_iter_state *st)
{
    return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
        hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}

/*
 * Get the first established socket starting from the bucket given in
 * st->bucket.  If st->bucket is zero, the very first socket in the hash
 * is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
    struct tcp_iter_state *st = seq->private;
    struct net *net = seq_file_net(seq);
    void *rc = NULL;

    st->offset = 0;
    for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
        struct sock *sk;
        struct hlist_nulls_node *node;
        struct inet_timewait_sock *tw;
        spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

        /* Lockless fast path for the common case of empty buckets */
        if (empty_bucket(st))
            continue;

        spin_lock_bh(lock);
        sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
            if (sk->sk_family != st->family ||
                !net_eq(sock_net(sk), net)) {
                continue;
            }
            rc = sk;
            goto out;
        }
        st->state = TCP_SEQ_STATE_TIME_WAIT;
        inet_twsk_for_each(tw, node,
                   &tcp_hashinfo.ehash[st->bucket].twchain) {
            if (tw->tw_family != st->family ||
                !net_eq(twsk_net(tw), net)) {
                continue;
            }
            rc = tw;
            goto out;
        }
        spin_unlock_bh(lock);
        st->state = TCP_SEQ_STATE_ESTABLISHED;
    }
out:
    return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
    struct sock *sk = cur;
    struct inet_timewait_sock *tw;
    struct hlist_nulls_node *node;
    struct tcp_iter_state *st = seq->private;
    struct net *net = seq_file_net(seq);

    ++st->num;
    ++st->offset;

    if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
        tw = cur;
        tw = tw_next(tw);
get_tw:
        while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
            tw = tw_next(tw);
        }
        if (tw) {
            cur = tw;
            goto out;
        }
        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
        st->state = TCP_SEQ_STATE_ESTABLISHED;

        /* Look for the next non-empty bucket */
        st->offset = 0;
        while (++st->bucket <= tcp_hashinfo.ehash_mask &&
                empty_bucket(st))
            ;
        if (st->bucket > tcp_hashinfo.ehash_mask)
            return NULL;

        spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
        sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
    } else
        sk = sk_nulls_next(sk);

    sk_nulls_for_each_from(sk, node) {
        if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
            goto found;
    }

    st->state = TCP_SEQ_STATE_TIME_WAIT;
    tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
    goto get_tw;
found:
    cur = sk;
out:
    return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
    struct tcp_iter_state *st = seq->private;
    void *rc;

    st->bucket = 0;
    rc = established_get_first(seq);

    while (rc && pos) {
        rc = established_get_next(seq, rc);
        --pos;
    }
    return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
    void *rc;
    struct tcp_iter_state *st = seq->private;

    st->state = TCP_SEQ_STATE_LISTENING;
    rc = listening_get_idx(seq, &pos);

    if (!rc) {
        st->state = TCP_SEQ_STATE_ESTABLISHED;
        rc = established_get_idx(seq, pos);
    }

    return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
    struct tcp_iter_state *st = seq->private;
    int offset = st->offset;
    int orig_num = st->num;
    void *rc = NULL;

    switch (st->state) {
    case TCP_SEQ_STATE_OPENREQ:
    case TCP_SEQ_STATE_LISTENING:
        if (st->bucket >= INET_LHTABLE_SIZE)
            break;
        st->state = TCP_SEQ_STATE_LISTENING;
        rc = listening_get_next(seq, NULL);
        while (offset-- && rc)
            rc = listening_get_next(seq, rc);
        if (rc)
            break;
        st->bucket = 0;
        /* Fallthrough */
    case TCP_SEQ_STATE_ESTABLISHED:
    case TCP_SEQ_STATE_TIME_WAIT:
        st->state = TCP_SEQ_STATE_ESTABLISHED;
        if (st->bucket > tcp_hashinfo.ehash_mask)
            break;
        rc = established_get_first(seq);
        while (offset-- && rc)
            rc = established_get_next(seq, rc);
    }

    st->num = orig_num;

    return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
    struct tcp_iter_state *st = seq->private;
    void *rc;

    if (*pos && *pos == st->last_pos) {
        rc = tcp_seek_last_pos(seq);
        if (rc)
            goto out;
    }

    st->state = TCP_SEQ_STATE_LISTENING;
    st->num = 0;
    st->bucket = 0;
    st->offset = 0;
    rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
    st->last_pos = *pos;
    return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
    struct tcp_iter_state *st = seq->private;
    void *rc = NULL;

    if (v == SEQ_START_TOKEN) {
        rc = tcp_get_idx(seq, 0);
        goto out;
    }

    switch (st->state) {
    case TCP_SEQ_STATE_OPENREQ:
    case TCP_SEQ_STATE_LISTENING:
        rc = listening_get_next(seq, v);
        if (!rc) {
            st->state = TCP_SEQ_STATE_ESTABLISHED;
            st->bucket = 0;
            st->offset = 0;
            rc = established_get_first(seq);
        }
        break;
    case TCP_SEQ_STATE_ESTABLISHED:
    case TCP_SEQ_STATE_TIME_WAIT:
        rc = established_get_next(seq, v);
        break;
    }
out:
    ++*pos;
    st->last_pos = *pos;
    return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
    struct tcp_iter_state *st = seq->private;

    switch (st->state) {
    case TCP_SEQ_STATE_OPENREQ:
        if (v) {
            struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
            read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
        }
    case TCP_SEQ_STATE_LISTENING:
        if (v != SEQ_START_TOKEN)
            spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
        break;
    case TCP_SEQ_STATE_TIME_WAIT:
    case TCP_SEQ_STATE_ESTABLISHED:
        if (v)
            spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
        break;
    }
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
    struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
    struct tcp_iter_state *s;
    int err;

    err = seq_open_net(inode, file, &afinfo->seq_ops,
               sizeof(struct tcp_iter_state));
    if (err < 0)
        return err;

    s = ((struct seq_file *)file->private_data)->private;
    s->family = afinfo->family;
    s->last_pos = 0;
    return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
    int rc = 0;
    struct proc_dir_entry *p;

    afinfo->seq_ops.start = tcp_seq_start;
    afinfo->seq_ops.next = tcp_seq_next;
    afinfo->seq_ops.stop = tcp_seq_stop;

    p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
                 afinfo->seq_fops, afinfo);
    if (!p)
        rc = -ENOMEM;
    return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
    proc_net_remove(net, afinfo->name);
}
EXPORT_SYMBOL(tcp_proc_unregister);

static void get_openreq4(const struct sock *sk, const struct request_sock *req,
             struct seq_file *f, int i, kuid_t uid, int *len)
{
    const struct inet_request_sock *ireq = inet_rsk(req);
    long delta = req->expires - jiffies;

    seq_printf(f, "%4d: %08X:%04X %08X:%04X"
        " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
        i,
        ireq->loc_addr,
        ntohs(inet_sk(sk)->inet_sport),
        ireq->rmt_addr,
        ntohs(ireq->rmt_port),
        TCP_SYN_RECV,
        0, 0, /* could print option size, but that is af dependent. */
        1,    /* timers active (only the expire timer) */
        jiffies_delta_to_clock_t(delta),
        req->num_timeout,
        from_kuid_munged(seq_user_ns(f), uid),
        0,  /* non standard timer */
        0,  /* open_requests have no inode */
        atomic_read(&sk->sk_refcnt),
        req,
        len);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
    int timer_active;
    unsigned long timer_expires;
    const struct tcp_sock *tp = tcp_sk(sk);
    const struct inet_connection_sock *icsk = inet_csk(sk);
    const struct inet_sock *inet = inet_sk(sk);
    struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
    __be32 dest = inet->inet_daddr;
    __be32 src = inet->inet_rcv_saddr;
    __u16 destp = ntohs(inet->inet_dport);
    __u16 srcp = ntohs(inet->inet_sport);
    int rx_queue;

    if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
        timer_active = 1;
        timer_expires = icsk->icsk_timeout;
    } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
        timer_active = 4;
        timer_expires = icsk->icsk_timeout;
    } else if (timer_pending(&sk->sk_timer)) {
        timer_active = 2;
        timer_expires = sk->sk_timer.expires;
    } else {
        timer_active = 0;
        timer_expires = jiffies;
    }

    if (sk->sk_state == TCP_LISTEN)
        rx_queue = sk->sk_ack_backlog;
    else
        /*
         * Because we don't lock the socket, we might find a
         * transient negative value.
         */
        rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

    seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
            "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
        i, src, srcp, dest, destp, sk->sk_state,
        tp->write_seq - tp->snd_una,
        rx_queue,
        timer_active,
        jiffies_delta_to_clock_t(timer_expires - jiffies),
        icsk->icsk_retransmits,
        from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
        icsk->icsk_probes_out,
        sock_i_ino(sk),
        atomic_read(&sk->sk_refcnt), sk,
        jiffies_to_clock_t(icsk->icsk_rto),
        jiffies_to_clock_t(icsk->icsk_ack.ato),
        (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
        tp->snd_cwnd,
        sk->sk_state == TCP_LISTEN ?
            (fastopenq ? fastopenq->max_qlen : 0) :
            (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
        len);
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
                   struct seq_file *f, int i, int *len)
{
    __be32 dest, src;
    __u16 destp, srcp;
    long delta = tw->tw_ttd - jiffies;

    dest  = tw->tw_daddr;
    src   = tw->tw_rcv_saddr;
    destp = ntohs(tw->tw_dport);
    srcp  = ntohs(tw->tw_sport);

    seq_printf(f, "%4d: %08X:%04X %08X:%04X"
        " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
        i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
        3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
        atomic_read(&tw->tw_refcnt), tw, len);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
    struct tcp_iter_state *st;
    int len;

    if (v == SEQ_START_TOKEN) {
        seq_printf(seq, "%-*s\n", TMPSZ - 1,
               "  sl  local_address rem_address   st tx_queue "
               "rx_queue tr tm->when retrnsmt   uid  timeout "
               "inode");
        goto out;
    }
    st = seq->private;

    switch (st->state) {
    case TCP_SEQ_STATE_LISTENING:
    case TCP_SEQ_STATE_ESTABLISHED:
        get_tcp4_sock(v, seq, st->num, &len);
        break;
    case TCP_SEQ_STATE_OPENREQ:
        get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
        break;
    case TCP_SEQ_STATE_TIME_WAIT:
        get_timewait4_sock(v, seq, st->num, &len);
        break;
    }
    seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
    return 0;
}
static const struct file_operations tcp_afinfo_seq_fops = {
    .owner   = THIS_MODULE,
    .open    = tcp_seq_open,
    .read    = seq_read,
    .llseek  = seq_lseek,
    .release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
    .name		= "tcp",
    .family		= AF_INET,
    .seq_fops	= &tcp_afinfo_seq_fops,
    .seq_ops	= {
        .show		= tcp4_seq_show,
    },
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
    return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
    tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
    .init = tcp4_proc_init_net,
    .exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
    return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
    unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
    const struct iphdr *iph = skb_gro_network_header(skb);
    __wsum wsum;
    __sum16 sum;

    switch (skb->ip_summed) {
    case CHECKSUM_COMPLETE:
        if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
                  skb->csum)) {
            skb->ip_summed = CHECKSUM_UNNECESSARY;
            break;
        }
flush:
        NAPI_GRO_CB(skb)->flush = 1;
        return NULL;

    case CHECKSUM_NONE:
        wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
                      skb_gro_len(skb), IPPROTO_TCP, 0);
        sum = csum_fold(skb_checksum(skb,
                         skb_gro_offset(skb),
                         skb_gro_len(skb),
                         wsum));
        if (sum)
            goto flush;

        skb->ip_summed = CHECKSUM_UNNECESSARY;
        break;
    }

    return tcp_gro_receive(head, skb);
}

int tcp4_gro_complete(struct sk_buff *skb)
{
    const struct iphdr *iph = ip_hdr(skb);
    struct tcphdr *th = tcp_hdr(skb);

    th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
                  iph->saddr, iph->daddr, 0);
    skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;

    return tcp_gro_complete(skb);
}
struct proto tcp_prot = {
    .name			= "TCP",
    .owner			= THIS_MODULE,
    .close			= tcp_close,
    .connect		= tcp_v4_connect,
    .disconnect		= tcp_disconnect,
    .accept			= inet_csk_accept,
    .ioctl			= tcp_ioctl,
    .init			= tcp_v4_init_sock,
    .destroy		= tcp_v4_destroy_sock,
    .shutdown		= tcp_shutdown,
    .setsockopt		= tcp_setsockopt,
    .getsockopt		= tcp_getsockopt,
    .recvmsg		= tcp_recvmsg,
    .sendmsg		= tcp_sendmsg,
    .sendpage		= tcp_sendpage,
    .backlog_rcv		= tcp_v4_do_rcv,
    .release_cb		= tcp_release_cb,
    .mtu_reduced		= tcp_v4_mtu_reduced,
    .hash			= inet_hash,
    .unhash			= inet_unhash,
    .get_port		= inet_csk_get_port,
    .enter_memory_pressure	= tcp_enter_memory_pressure,
    .sockets_allocated	= &tcp_sockets_allocated,
    .orphan_count		= &tcp_orphan_count,
    .memory_allocated	= &tcp_memory_allocated,
    .memory_pressure	= &tcp_memory_pressure,
    .sysctl_wmem		= sysctl_tcp_wmem,
    .sysctl_rmem		= sysctl_tcp_rmem,
    .max_header		= MAX_TCP_HEADER,
    .obj_size		= sizeof(struct tcp_sock),
    .slab_flags		= SLAB_DESTROY_BY_RCU,
    .twsk_prot		= &tcp_timewait_sock_ops,
    .rsk_prot		= &tcp_request_sock_ops,
    .h.hashinfo		= &tcp_hashinfo,
    .no_autobind		= true,
#ifdef CONFIG_COMPAT
    .compat_setsockopt	= compat_tcp_setsockopt,
    .compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
    .init_cgroup		= tcp_init_cgroup,
    .destroy_cgroup		= tcp_destroy_cgroup,
    .proto_cgroup		= tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);

static int __net_init tcp_sk_init(struct net *net)
{
    return 0;
}

static void __net_exit tcp_sk_exit(struct net *net)
{
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
    inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
    .init	    = tcp_sk_init,
    .exit	    = tcp_sk_exit,
    .exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
    inet_hashinfo_init(&tcp_hashinfo);
    if (register_pernet_subsys(&tcp_sk_ops))
        panic("Failed to create the TCP control socket.\n");
}