inet: remove some sk_listener dependencies
[linux-2.6-block.git] / net / ipv4 / tcp_ipv4.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

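/* Choose the initial sequence number for an outgoing connection from the
 * 4-tuple, via secure_tcp_sequence_number(), so that successive
 * connections between the same endpoints get non-overlapping and hard to
 * predict sequence spaces.
 */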
static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

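/* Called on a connect()-time collision with a TIME-WAIT bucket: decide
 * whether the old bucket may be recycled for the new connection. Returns
 * 1 (and takes a reference on sktw) if reuse is safe, 0 otherwise.
 */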
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

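/* Outline of the connect path below: resolve a route (honouring any
 * source-routing option), enter SYN-SENT, pick a source port through
 * inet_hash_connect(), commit the route to the socket, choose write_seq
 * and finally transmit the SYN via tcp_connect().
 */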
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete the initialization afterwards.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	inet_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * in case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			reqsk_put(req);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		reqsk_put(req);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && fastopen->sk == NULL)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

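/* Fill in the checksum of an outgoing segment: with CHECKSUM_PARTIAL the
 * pseudo-header sum is stored and the device completes the checksum,
 * otherwise it is computed in full here in software.
 */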
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * We do not loosen security here:
		 * the incoming packet is checked with the md5 hash of the
		 * found key, and no RST is generated if the hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net,
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      u16 queue_mapping,
			      struct tcp_fastopen_cookie *foc)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

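/* Note on sysctl_tcp_syncookies: 0 disables cookies entirely, 1 sends
 * them once the listen queue overflows (with the one-time warning below),
 * and 2 sends them unconditionally while suppressing the warning.
 */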
/*
 *	Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

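/* setsockopt(TCP_MD5SIG) handler. A minimal userspace sketch of how a
 * key is installed (illustrative only; error handling omitted):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that peer address instead.
 */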
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->ir_loc_addr;
		daddr = inet_rsk(req)->ir_rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
				      const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
}

static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	bool ret;

	rcu_read_lock();
	ret = __tcp_v4_inbound_md5_hash(sk, skb);
	rcu_read_unlock();

	return ret;
}

#endif

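/* request_sock_ops callbacks for IPv4: seed a fresh request sock from the
 * incoming SYN (addresses, IP options, transparent flag) and route the
 * SYN-ACK reply.
 */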
static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
	ireq->opt = tcp_v4_save_options(skb);
	ireq->ireq_family = AF_INET;
}

static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}

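/* The two tables below plug the IPv4-specific callbacks into the
 * address-family independent request_sock machinery driven by
 * tcp_conn_request().
 */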
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
	.queue_hash_add	=	inet_csk_reqsk_queue_hash_add,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	inet_set_txhash(newsk);
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

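/* For a segment arriving on a listening socket: first match it against
 * pending request socks, then against the established hash (the request
 * may already have been promoted to a child socket), and finally fall
 * back to syncookie validation for bare ACKs.
 */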
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct request_sock *req;
	struct sock *nsk;

	req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
	if (req) {
		nsk = tcp_check_req(sk, skb, req, false);
		reqsk_put(req);
		return nsk;
	}

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    dst->ops->check(dst, 0) == NULL) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			sk_mark_napi_id(sk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

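/* Early demux: look up the established socket for an incoming skb before
 * routing, so that its cached rx dst and socket reference can be reused.
 */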
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = sk->sk_rx_dst;

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)			--ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
	 * Instead of doing full sk_rx_dst validity here, let's perform
	 * an optimistic check.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (tp->ucopy.memory > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
			sk_backlog_rcv(sk, skb1);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPPREQUEUEDROPPED);
		}

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);

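/* Main IPv4 receive entry point, called for every TCP segment delivered
 * by IP: validate the header, find the owning socket, then process the
 * segment directly, via the prequeue, or via the socket backlog.
 */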
1da177e4
LT
1592/*
1593 * From tcp_input.c
1594 */
1595
1596int tcp_v4_rcv(struct sk_buff *skb)
1597{
eddc9ec5 1598 const struct iphdr *iph;
cf533ea5 1599 const struct tcphdr *th;
1da177e4
LT
1600 struct sock *sk;
1601 int ret;
a86b1e30 1602 struct net *net = dev_net(skb->dev);
1da177e4
LT
1603
1604 if (skb->pkt_type != PACKET_HOST)
1605 goto discard_it;
1606
1607 /* Count it even if it's bad */
63231bdd 1608 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1da177e4
LT
1609
1610 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1611 goto discard_it;
1612
aa8223c7 1613 th = tcp_hdr(skb);
1da177e4
LT
1614
1615 if (th->doff < sizeof(struct tcphdr) / 4)
1616 goto bad_packet;
1617 if (!pskb_may_pull(skb, th->doff * 4))
1618 goto discard_it;
1619
1620 /* An explanation is required here, I think.
1621 * Packet length and doff are validated by header prediction,
caa20d9a 1622 * provided case of th->doff==0 is eliminated.
1da177e4 1623 * So, we defer the checks. */
ed70fcfc
TH
1624
1625 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1626 goto csum_error;
1da177e4 1627
aa8223c7 1628 th = tcp_hdr(skb);
eddc9ec5 1629 iph = ip_hdr(skb);
971f10ec
ED
1630 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1631 * barrier() makes sure compiler wont play fool^Waliasing games.
1632 */
1633 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1634 sizeof(struct inet_skb_parm));
1635 barrier();
1636
1da177e4
LT
1637 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1638 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1639 skb->len - th->doff * 4);
1640 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
e11ecddf 1641 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
04317daf 1642 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
b82d1bb4 1643 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1da177e4
LT
1644 TCP_SKB_CB(skb)->sacked = 0;
1645
9a1f27c4 1646 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1da177e4
LT
1647 if (!sk)
1648 goto no_tcp_socket;
1649
bb134d5d
ED
1650process:
1651 if (sk->sk_state == TCP_TIME_WAIT)
1652 goto do_time_wait;
1653
6cce09f8
ED
1654 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1655 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1656 goto discard_and_relse;
6cce09f8 1657 }
d218d111 1658
1da177e4
LT
1659 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1660 goto discard_and_relse;
9ea88a15
DP
1661
1662#ifdef CONFIG_TCP_MD5SIG
1663 /*
1664 * We really want to reject the packet as early as possible
1665 * if:
1666 * o We're expecting an MD5'd packet and this is no MD5 tcp option
1667 * o There is an MD5 option and we're not expecting one
1668 */
1669 if (tcp_v4_inbound_md5_hash(sk, skb))
1670 goto discard_and_relse;
1671#endif
1672
b59c2701 1673 nf_reset(skb);
1da177e4 1674
fda9ef5d 1675 if (sk_filter(sk, skb))
1da177e4
LT
1676 goto discard_and_relse;
1677
2c8c56e1 1678 sk_incoming_cpu_update(sk);
1da177e4
LT
1679 skb->dev = NULL;
1680
c6366184 1681 bh_lock_sock_nested(sk);
1da177e4
LT
1682 ret = 0;
1683 if (!sock_owned_by_user(sk)) {
7bced397 1684 if (!tcp_prequeue(sk, skb))
1da177e4 1685 ret = tcp_v4_do_rcv(sk, skb);
da882c1f
ED
1686 } else if (unlikely(sk_add_backlog(sk, skb,
1687 sk->sk_rcvbuf + sk->sk_sndbuf))) {
6b03a53a 1688 bh_unlock_sock(sk);
6cce09f8 1689 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1690 goto discard_and_relse;
1691 }
1692 bh_unlock_sock(sk);
1693
1694 sock_put(sk);
1695
1696 return ret;
1697
1698no_tcp_socket:
1699 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1700 goto discard_it;
1701
1702 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1703csum_error:
1704 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1da177e4 1705bad_packet:
63231bdd 1706 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1da177e4 1707 } else {
cfb6eeb4 1708 tcp_v4_send_reset(NULL, skb);
1709 }
1710
1711discard_it:
1712 /* Discard frame. */
1713 kfree_skb(skb);
e905a9ed 1714 return 0;
1715
1716discard_and_relse:
1717 sock_put(sk);
1718 goto discard_it;
1719
1720do_time_wait:
1721 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 1722 inet_twsk_put(inet_twsk(sk));
1723 goto discard_it;
1724 }
1725
6a5dc9e5 1726 if (skb->len < (th->doff << 2)) {
9469c7b4 1727 inet_twsk_put(inet_twsk(sk));
1728 goto bad_packet;
1729 }
1730 if (tcp_checksum_complete(skb)) {
1731 inet_twsk_put(inet_twsk(sk));
1732 goto csum_error;
1da177e4 1733 }
9469c7b4 1734 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 1735 case TCP_TW_SYN: {
c346dca1 1736 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
c67499c0 1737 &tcp_hashinfo,
da5e3630 1738 iph->saddr, th->source,
eddc9ec5 1739 iph->daddr, th->dest,
463c84b9 1740 inet_iif(skb));
1da177e4 1741 if (sk2) {
1742 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1743 inet_twsk_put(inet_twsk(sk));
1744 sk = sk2;
1745 goto process;
1746 }
1747 /* Fall through to ACK */
1748 }
1749 case TCP_TW_ACK:
1750 tcp_v4_timewait_ack(sk, skb);
1751 break;
1752 case TCP_TW_RST:
1753 goto no_tcp_socket;
1754 case TCP_TW_SUCCESS:;
1755 }
1756 goto discard_it;
1757}
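/*
 * The do_time_wait dispositions above, in brief: TCP_TW_SYN means a valid
 * new SYN hit a TIME_WAIT socket, so a current listener is looked up and
 * the packet reprocessed against it (falling through to ACK if none is
 * found); TCP_TW_ACK re-sends the last ACK; TCP_TW_RST is handled as if no
 * socket existed, typically provoking a reset; TCP_TW_SUCCESS consumes the
 * packet silently.
 */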
1758
1759static struct timewait_sock_ops tcp_timewait_sock_ops = {
1760 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1761 .twsk_unique = tcp_twsk_unique,
1762 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 1763};
1da177e4 1764
63d02d15 1765void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1766{
1767 struct dst_entry *dst = skb_dst(skb);
1768
1769 if (dst) {
1770 dst_hold(dst);
1771 sk->sk_rx_dst = dst;
1772 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1773 }
5d299f3d 1774}
63d02d15 1775EXPORT_SYMBOL(inet_sk_rx_dst_set);
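/*
 * inet_sk_rx_dst_set() caches the incoming route in sk->sk_rx_dst together
 * with the ingress ifindex, so early demux on subsequent packets arriving
 * on the same interface can skip a full route lookup while the cached
 * entry stays valid.
 */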
5d299f3d 1776
3b401a81 1777const struct inet_connection_sock_af_ops ipv4_specific = {
1778 .queue_xmit = ip_queue_xmit,
1779 .send_check = tcp_v4_send_check,
1780 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 1781 .sk_rx_dst_set = inet_sk_rx_dst_set,
1782 .conn_request = tcp_v4_conn_request,
1783 .syn_recv_sock = tcp_v4_syn_recv_sock,
1784 .net_header_len = sizeof(struct iphdr),
1785 .setsockopt = ip_setsockopt,
1786 .getsockopt = ip_getsockopt,
1787 .addr2sockaddr = inet_csk_addr2sockaddr,
1788 .sockaddr_len = sizeof(struct sockaddr_in),
ab1e0a13 1789 .bind_conflict = inet_csk_bind_conflict,
3fdadf7d 1790#ifdef CONFIG_COMPAT
1791 .compat_setsockopt = compat_ip_setsockopt,
1792 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 1793#endif
4fab9071 1794 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 1795};
4bc2f18b 1796EXPORT_SYMBOL(ipv4_specific);
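/*
 * ipv4_specific is the inet_connection_sock_af_ops table through which the
 * af-independent TCP core reaches its IPv4 behaviour. The same indirection
 * is what lets an AF_INET6 socket talking to an IPv4-mapped peer swap in
 * IPv4 operations at runtime rather than duplicating the code paths.
 */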
1da177e4 1797
cfb6eeb4 1798#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 1799static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 1800 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 1801 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 1802 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 1803};
b6332e6c 1804#endif
cfb6eeb4 1805
 1806/* NOTE: A lot of things are set to zero explicitly by the call to
 1807 * sk_alloc(), so they need not be done here.
 1808 */
1809static int tcp_v4_init_sock(struct sock *sk)
1810{
6687e988 1811 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 1812
900f65d3 1813 tcp_init_sock(sk);
1da177e4 1814
8292a17a 1815 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 1816
cfb6eeb4 1817#ifdef CONFIG_TCP_MD5SIG
ac807fa8 1818 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 1819#endif
1da177e4 1820
1821 return 0;
1822}
1823
7d06b2e0 1824void tcp_v4_destroy_sock(struct sock *sk)
1825{
1826 struct tcp_sock *tp = tcp_sk(sk);
1827
1828 tcp_clear_xmit_timers(sk);
1829
6687e988 1830 tcp_cleanup_congestion_control(sk);
317a76f9 1831
1da177e4 1832 /* Clean up the write buffer. */
fe067e8a 1833 tcp_write_queue_purge(sk);
1834
1835 /* Cleans up our, hopefully empty, out_of_order_queue. */
e905a9ed 1836 __skb_queue_purge(&tp->out_of_order_queue);
1da177e4 1837
1838#ifdef CONFIG_TCP_MD5SIG
1839 /* Clean up the MD5 key list, if any */
1840 if (tp->md5sig_info) {
a915da9b 1841 tcp_clear_md5_list(sk);
a8afca03 1842 kfree_rcu(tp->md5sig_info, rcu);
1843 tp->md5sig_info = NULL;
1844 }
1845#endif
1a2449a8 1846
 1847 /* Clean up the prequeue; it really should be empty by now */
1848 __skb_queue_purge(&tp->ucopy.prequeue);
1849
1850 /* Clean up a referenced TCP bind bucket. */
463c84b9 1851 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 1852 inet_put_port(sk);
1da177e4 1853
168a8f58 1854 BUG_ON(tp->fastopen_rsk != NULL);
435cf559 1855
 1856 /* If the socket was aborted during a connect operation */
1857 tcp_free_fastopen_req(tp);
1858
180d8cd9 1859 sk_sockets_allocated_dec(sk);
d1a4c0b3 1860 sock_release_memcg(sk);
1da177e4 1861}
1862EXPORT_SYMBOL(tcp_v4_destroy_sock);
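/*
 * A note on teardown order in tcp_v4_destroy_sock(): timers are stopped
 * before the queues they might touch are purged, the MD5 key list is freed
 * via kfree_rcu() because lockless readers may still be traversing it, and
 * dropping the bind-bucket reference releases this socket's claim on its
 * local port.
 */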
1863
1864#ifdef CONFIG_PROC_FS
1865/* Proc filesystem TCP sock list dumping. */
1866
 1867/*
 1868 * Get the next listener socket following cur. If cur is NULL, get the
 1869 * first socket, starting from the bucket given in st->bucket; when
 1870 * st->bucket is zero, the very first socket in the hash is returned.
 1871 */
1872static void *listening_get_next(struct seq_file *seq, void *cur)
1873{
463c84b9 1874 struct inet_connection_sock *icsk;
c25eb3bf 1875 struct hlist_nulls_node *node;
1da177e4 1876 struct sock *sk = cur;
5caea4ea 1877 struct inet_listen_hashbucket *ilb;
5799de0b 1878 struct tcp_iter_state *st = seq->private;
a4146b1b 1879 struct net *net = seq_file_net(seq);
1880
1881 if (!sk) {
a8b690f9 1882 ilb = &tcp_hashinfo.listening_hash[st->bucket];
5caea4ea 1883 spin_lock_bh(&ilb->lock);
c25eb3bf 1884 sk = sk_nulls_head(&ilb->head);
a8b690f9 1885 st->offset = 0;
1886 goto get_sk;
1887 }
5caea4ea 1888 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 1889 ++st->num;
a8b690f9 1890 ++st->offset;
1891
1892 if (st->state == TCP_SEQ_STATE_OPENREQ) {
60236fdd 1893 struct request_sock *req = cur;
1da177e4 1894
72a3effa 1895 icsk = inet_csk(st->syn_wait_sk);
1896 req = req->dl_next;
1897 while (1) {
1898 while (req) {
bdccc4ca 1899 if (req->rsk_ops->family == st->family) {
1900 cur = req;
1901 goto out;
1902 }
1903 req = req->dl_next;
1904 }
72a3effa 1905 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1906 break;
1907get_req:
463c84b9 1908 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1da177e4 1909 }
1bde5ac4 1910 sk = sk_nulls_next(st->syn_wait_sk);
1da177e4 1911 st->state = TCP_SEQ_STATE_LISTENING;
463c84b9 1912 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 1913 } else {
e905a9ed 1914 icsk = inet_csk(sk);
1915 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1916 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1da177e4 1917 goto start_req;
463c84b9 1918 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1bde5ac4 1919 sk = sk_nulls_next(sk);
1920 }
1921get_sk:
c25eb3bf 1922 sk_nulls_for_each_from(sk, node) {
1923 if (!net_eq(sock_net(sk), net))
1924 continue;
1925 if (sk->sk_family == st->family) {
1926 cur = sk;
1927 goto out;
1928 }
e905a9ed 1929 icsk = inet_csk(sk);
1930 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1931 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1932start_req:
1933 st->uid = sock_i_uid(sk);
1934 st->syn_wait_sk = sk;
1935 st->state = TCP_SEQ_STATE_OPENREQ;
1936 st->sbucket = 0;
1937 goto get_req;
1938 }
463c84b9 1939 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 1940 }
5caea4ea 1941 spin_unlock_bh(&ilb->lock);
a8b690f9 1942 st->offset = 0;
0f7ff927 1943 if (++st->bucket < INET_LHTABLE_SIZE) {
1944 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1945 spin_lock_bh(&ilb->lock);
c25eb3bf 1946 sk = sk_nulls_head(&ilb->head);
1947 goto get_sk;
1948 }
1949 cur = NULL;
1950out:
1951 return cur;
1952}
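/*
 * The listening iterator above is a small state machine: st->bucket walks
 * the listening hash, st->offset records how deep into the current bucket
 * the previous read stopped, and TCP_SEQ_STATE_OPENREQ is a sub-state that
 * walks the SYN table (indexed by st->sbucket) of the listener saved in
 * st->syn_wait_sk, holding syn_wait_lock for the duration of that walk.
 */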
1953
1954static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1955{
1956 struct tcp_iter_state *st = seq->private;
1957 void *rc;
1958
1959 st->bucket = 0;
1960 st->offset = 0;
1961 rc = listening_get_next(seq, NULL);
1962
1963 while (rc && *pos) {
1964 rc = listening_get_next(seq, rc);
1965 --*pos;
1966 }
1967 return rc;
1968}
1969
05dbc7b5 1970static inline bool empty_bucket(const struct tcp_iter_state *st)
6eac5604 1971{
05dbc7b5 1972 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1973}
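/*
 * empty_bucket() peeks at the ehash chain without taking the bucket lock;
 * the chains are hlist_nulls lists, so the check is safe against
 * concurrent updates and at worst races benignly (a bucket that becomes
 * non-empty after the check is simply skipped on this pass).
 */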
1974
1975/*
 1976 * Get the first established socket, starting from the bucket given in st->bucket.
1977 * If st->bucket is zero, the very first socket in the hash is returned.
1978 */
1979static void *established_get_first(struct seq_file *seq)
1980{
5799de0b 1981 struct tcp_iter_state *st = seq->private;
a4146b1b 1982 struct net *net = seq_file_net(seq);
1983 void *rc = NULL;
1984
1985 st->offset = 0;
1986 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 1987 struct sock *sk;
3ab5aee7 1988 struct hlist_nulls_node *node;
9db66bdc 1989 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 1990
1991 /* Lockless fast path for the common case of empty buckets */
1992 if (empty_bucket(st))
1993 continue;
1994
9db66bdc 1995 spin_lock_bh(lock);
3ab5aee7 1996 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
f40c8174 1997 if (sk->sk_family != st->family ||
878628fb 1998 !net_eq(sock_net(sk), net)) {
1999 continue;
2000 }
2001 rc = sk;
2002 goto out;
2003 }
9db66bdc 2004 spin_unlock_bh(lock);
2005 }
2006out:
2007 return rc;
2008}
2009
2010static void *established_get_next(struct seq_file *seq, void *cur)
2011{
2012 struct sock *sk = cur;
3ab5aee7 2013 struct hlist_nulls_node *node;
5799de0b 2014 struct tcp_iter_state *st = seq->private;
a4146b1b 2015 struct net *net = seq_file_net(seq);
2016
2017 ++st->num;
a8b690f9 2018 ++st->offset;
1da177e4 2019
05dbc7b5 2020 sk = sk_nulls_next(sk);
1da177e4 2021
3ab5aee7 2022 sk_nulls_for_each_from(sk, node) {
878628fb 2023 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
05dbc7b5 2024 return sk;
2025 }
2026
2027 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2028 ++st->bucket;
2029 return established_get_first(seq);
2030}
2031
2032static void *established_get_idx(struct seq_file *seq, loff_t pos)
2033{
2034 struct tcp_iter_state *st = seq->private;
2035 void *rc;
2036
2037 st->bucket = 0;
2038 rc = established_get_first(seq);
2039
2040 while (rc && pos) {
2041 rc = established_get_next(seq, rc);
2042 --pos;
7174259e 2043 }
2044 return rc;
2045}
2046
2047static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2048{
2049 void *rc;
5799de0b 2050 struct tcp_iter_state *st = seq->private;
1da177e4 2051
2052 st->state = TCP_SEQ_STATE_LISTENING;
2053 rc = listening_get_idx(seq, &pos);
2054
2055 if (!rc) {
2056 st->state = TCP_SEQ_STATE_ESTABLISHED;
2057 rc = established_get_idx(seq, pos);
2058 }
2059
2060 return rc;
2061}
2062
2063static void *tcp_seek_last_pos(struct seq_file *seq)
2064{
2065 struct tcp_iter_state *st = seq->private;
2066 int offset = st->offset;
2067 int orig_num = st->num;
2068 void *rc = NULL;
2069
2070 switch (st->state) {
2071 case TCP_SEQ_STATE_OPENREQ:
2072 case TCP_SEQ_STATE_LISTENING:
2073 if (st->bucket >= INET_LHTABLE_SIZE)
2074 break;
2075 st->state = TCP_SEQ_STATE_LISTENING;
2076 rc = listening_get_next(seq, NULL);
2077 while (offset-- && rc)
2078 rc = listening_get_next(seq, rc);
2079 if (rc)
2080 break;
2081 st->bucket = 0;
05dbc7b5 2082 st->state = TCP_SEQ_STATE_ESTABLISHED;
2083 /* Fallthrough */
2084 case TCP_SEQ_STATE_ESTABLISHED:
2085 if (st->bucket > tcp_hashinfo.ehash_mask)
2086 break;
2087 rc = established_get_first(seq);
2088 while (offset-- && rc)
2089 rc = established_get_next(seq, rc);
2090 }
2091
2092 st->num = orig_num;
2093
2094 return rc;
2095}
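/*
 * tcp_seek_last_pos() is an optimisation for sequential /proc/net/tcp
 * readers: rather than re-walking the hash tables from scratch on every
 * read(), it resumes from the bucket and in-bucket offset recorded by the
 * previous call, restoring st->num so line numbering stays continuous.
 */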
2096
2097static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2098{
5799de0b 2099 struct tcp_iter_state *st = seq->private;
2100 void *rc;
2101
2102 if (*pos && *pos == st->last_pos) {
2103 rc = tcp_seek_last_pos(seq);
2104 if (rc)
2105 goto out;
2106 }
2107
2108 st->state = TCP_SEQ_STATE_LISTENING;
2109 st->num = 0;
2110 st->bucket = 0;
2111 st->offset = 0;
2112 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2113
2114out:
2115 st->last_pos = *pos;
2116 return rc;
2117}
2118
2119static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2120{
a8b690f9 2121 struct tcp_iter_state *st = seq->private;
1da177e4 2122 void *rc = NULL;
2123
2124 if (v == SEQ_START_TOKEN) {
2125 rc = tcp_get_idx(seq, 0);
2126 goto out;
2127 }
2128
2129 switch (st->state) {
2130 case TCP_SEQ_STATE_OPENREQ:
2131 case TCP_SEQ_STATE_LISTENING:
2132 rc = listening_get_next(seq, v);
2133 if (!rc) {
1da177e4 2134 st->state = TCP_SEQ_STATE_ESTABLISHED;
2135 st->bucket = 0;
2136 st->offset = 0;
2137 rc = established_get_first(seq);
2138 }
2139 break;
2140 case TCP_SEQ_STATE_ESTABLISHED:
2141 rc = established_get_next(seq, v);
2142 break;
2143 }
2144out:
2145 ++*pos;
a8b690f9 2146 st->last_pos = *pos;
2147 return rc;
2148}
2149
2150static void tcp_seq_stop(struct seq_file *seq, void *v)
2151{
5799de0b 2152 struct tcp_iter_state *st = seq->private;
2153
2154 switch (st->state) {
2155 case TCP_SEQ_STATE_OPENREQ:
2156 if (v) {
2157 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2158 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2159 }
2160 case TCP_SEQ_STATE_LISTENING:
2161 if (v != SEQ_START_TOKEN)
5caea4ea 2162 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4 2163 break;
2164 case TCP_SEQ_STATE_ESTABLISHED:
2165 if (v)
9db66bdc 2166 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2167 break;
2168 }
2169}
2170
73cb88ec 2171int tcp_seq_open(struct inode *inode, struct file *file)
1da177e4 2172{
d9dda78b 2173 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
1da177e4 2174 struct tcp_iter_state *s;
52d6f3f1 2175 int err;
1da177e4 2176
2177 err = seq_open_net(inode, file, &afinfo->seq_ops,
2178 sizeof(struct tcp_iter_state));
2179 if (err < 0)
2180 return err;
f40c8174 2181
52d6f3f1 2182 s = ((struct seq_file *)file->private_data)->private;
1da177e4 2183 s->family = afinfo->family;
688d1945 2184 s->last_pos = 0;
2185 return 0;
2186}
73cb88ec 2187EXPORT_SYMBOL(tcp_seq_open);
f40c8174 2188
6f8b13bc 2189int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2190{
2191 int rc = 0;
2192 struct proc_dir_entry *p;
2193
2194 afinfo->seq_ops.start = tcp_seq_start;
2195 afinfo->seq_ops.next = tcp_seq_next;
2196 afinfo->seq_ops.stop = tcp_seq_stop;
2197
84841c3c 2198 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
73cb88ec 2199 afinfo->seq_fops, afinfo);
84841c3c 2200 if (!p)
2201 rc = -ENOMEM;
2202 return rc;
2203}
4bc2f18b 2204EXPORT_SYMBOL(tcp_proc_register);
1da177e4 2205
6f8b13bc 2206void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4 2207{
ece31ffd 2208 remove_proc_entry(afinfo->name, net->proc_net);
1da177e4 2209}
4bc2f18b 2210EXPORT_SYMBOL(tcp_proc_unregister);
1da177e4 2211
d4f06873 2212static void get_openreq4(const struct request_sock *req,
652586df 2213 struct seq_file *f, int i, kuid_t uid)
1da177e4 2214{
2e6599cb 2215 const struct inet_request_sock *ireq = inet_rsk(req);
fa76ce73 2216 long delta = req->rsk_timer.expires - jiffies;
1da177e4 2217
5e659e4c 2218 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2219 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2220 i,
634fb979 2221 ireq->ir_loc_addr,
d4f06873 2222 ireq->ir_num,
2223 ireq->ir_rmt_addr,
2224 ntohs(ireq->ir_rmt_port),
2225 TCP_SYN_RECV,
2226 0, 0, /* could print option size, but that is af dependent. */
2227 1, /* timers active (only the expire timer) */
a399a805 2228 jiffies_delta_to_clock_t(delta),
e6c022a4 2229 req->num_timeout,
a7cb5a49 2230 from_kuid_munged(seq_user_ns(f), uid),
2231 0, /* non standard timer */
2232 0, /* open_requests have no inode */
d4f06873 2233 0,
652586df 2234 req);
2235}
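/*
 * get_openreq4() emits one /proc/net/tcp line per pending open request.
 * An illustrative, made-up line for a SYN_RECV entry on port 80 could read:
 *
 *   2: 0100007F:0050 0200007F:C350 03 00000000:00000000 01:00000064 ...
 *
 * addresses and ports are hex (0100007F:0050 is 127.0.0.1:80), state 03 is
 * TCP_SYN_RECV, the queue sizes are reported as zero, and 01:... shows the
 * single active (expire) timer with its remaining clock ticks.
 */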
2236
652586df 2237static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2238{
2239 int timer_active;
2240 unsigned long timer_expires;
cf533ea5 2241 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2242 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2243 const struct inet_sock *inet = inet_sk(sk);
168a8f58 2244 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2245 __be32 dest = inet->inet_daddr;
2246 __be32 src = inet->inet_rcv_saddr;
2247 __u16 destp = ntohs(inet->inet_dport);
2248 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2249 int rx_queue;
1da177e4 2250
2251 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2252 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2253 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2254 timer_active = 1;
2255 timer_expires = icsk->icsk_timeout;
2256 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2257 timer_active = 4;
463c84b9 2258 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2259 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2260 timer_active = 2;
cf4c6bf8 2261 timer_expires = sk->sk_timer.expires;
2262 } else {
2263 timer_active = 0;
2264 timer_expires = jiffies;
2265 }
2266
2267 if (sk->sk_state == TCP_LISTEN)
2268 rx_queue = sk->sk_ack_backlog;
2269 else
 2270 /*
 2271 * Because we don't lock the socket, we might find a transient negative value.
 2272 */
2273 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2274
5e659e4c 2275 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2276 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
cf4c6bf8 2277 i, src, srcp, dest, destp, sk->sk_state,
47da8ee6 2278 tp->write_seq - tp->snd_una,
49d09007 2279 rx_queue,
1da177e4 2280 timer_active,
a399a805 2281 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2282 icsk->icsk_retransmits,
a7cb5a49 2283 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2284 icsk->icsk_probes_out,
2285 sock_i_ino(sk),
2286 atomic_read(&sk->sk_refcnt), sk,
2287 jiffies_to_clock_t(icsk->icsk_rto),
2288 jiffies_to_clock_t(icsk->icsk_ack.ato),
463c84b9 2289 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1da177e4 2290 tp->snd_cwnd,
2291 sk->sk_state == TCP_LISTEN ?
2292 (fastopenq ? fastopenq->max_qlen : 0) :
652586df 2293 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2294}
2295
cf533ea5 2296static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2297 struct seq_file *f, int i)
1da177e4 2298{
23f33c2d 2299 __be32 dest, src;
1da177e4 2300 __u16 destp, srcp;
e2a1d3e4 2301 s32 delta = tw->tw_ttd - inet_tw_time_stamp();
2302
2303 dest = tw->tw_daddr;
2304 src = tw->tw_rcv_saddr;
2305 destp = ntohs(tw->tw_dport);
2306 srcp = ntohs(tw->tw_sport);
2307
5e659e4c 2308 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2309 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2310 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2311 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
652586df 2312 atomic_read(&tw->tw_refcnt), tw);
2313}
2314
2315#define TMPSZ 150
2316
2317static int tcp4_seq_show(struct seq_file *seq, void *v)
2318{
5799de0b 2319 struct tcp_iter_state *st;
05dbc7b5 2320 struct sock *sk = v;
1da177e4 2321
652586df 2322 seq_setwidth(seq, TMPSZ - 1);
1da177e4 2323 if (v == SEQ_START_TOKEN) {
652586df 2324 seq_puts(seq, " sl local_address rem_address st tx_queue "
2325 "rx_queue tr tm->when retrnsmt uid timeout "
2326 "inode");
2327 goto out;
2328 }
2329 st = seq->private;
2330
2331 switch (st->state) {
2332 case TCP_SEQ_STATE_LISTENING:
2333 case TCP_SEQ_STATE_ESTABLISHED:
05dbc7b5 2334 if (sk->sk_state == TCP_TIME_WAIT)
652586df 2335 get_timewait4_sock(v, seq, st->num);
05dbc7b5 2336 else
652586df 2337 get_tcp4_sock(v, seq, st->num);
2338 break;
2339 case TCP_SEQ_STATE_OPENREQ:
d4f06873 2340 get_openreq4(v, seq, st->num, st->uid);
2341 break;
2342 }
1da177e4 2343out:
652586df 2344 seq_pad(seq, '\n');
2345 return 0;
2346}
2347
2348static const struct file_operations tcp_afinfo_seq_fops = {
2349 .owner = THIS_MODULE,
2350 .open = tcp_seq_open,
2351 .read = seq_read,
2352 .llseek = seq_lseek,
2353 .release = seq_release_net
2354};
2355
1da177e4 2356static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2357 .name = "tcp",
2358 .family = AF_INET,
73cb88ec 2359 .seq_fops = &tcp_afinfo_seq_fops,
2360 .seq_ops = {
2361 .show = tcp4_seq_show,
2362 },
2363};
2364
2c8c1e72 2365static int __net_init tcp4_proc_init_net(struct net *net)
2366{
2367 return tcp_proc_register(net, &tcp4_seq_afinfo);
2368}
2369
2c8c1e72 2370static void __net_exit tcp4_proc_exit_net(struct net *net)
2371{
2372 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2373}
2374
2375static struct pernet_operations tcp4_net_ops = {
2376 .init = tcp4_proc_init_net,
2377 .exit = tcp4_proc_exit_net,
2378};
2379
2380int __init tcp4_proc_init(void)
2381{
757764f6 2382 return register_pernet_subsys(&tcp4_net_ops);
2383}
2384
2385void tcp4_proc_exit(void)
2386{
757764f6 2387 unregister_pernet_subsys(&tcp4_net_ops);
2388}
2389#endif /* CONFIG_PROC_FS */
2390
2391struct proto tcp_prot = {
2392 .name = "TCP",
2393 .owner = THIS_MODULE,
2394 .close = tcp_close,
2395 .connect = tcp_v4_connect,
2396 .disconnect = tcp_disconnect,
463c84b9 2397 .accept = inet_csk_accept,
2398 .ioctl = tcp_ioctl,
2399 .init = tcp_v4_init_sock,
2400 .destroy = tcp_v4_destroy_sock,
2401 .shutdown = tcp_shutdown,
2402 .setsockopt = tcp_setsockopt,
2403 .getsockopt = tcp_getsockopt,
1da177e4 2404 .recvmsg = tcp_recvmsg,
2405 .sendmsg = tcp_sendmsg,
2406 .sendpage = tcp_sendpage,
1da177e4 2407 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 2408 .release_cb = tcp_release_cb,
2409 .hash = inet_hash,
2410 .unhash = inet_unhash,
2411 .get_port = inet_csk_get_port,
1da177e4 2412 .enter_memory_pressure = tcp_enter_memory_pressure,
c9bee3b7 2413 .stream_memory_free = tcp_stream_memory_free,
1da177e4 2414 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 2415 .orphan_count = &tcp_orphan_count,
2416 .memory_allocated = &tcp_memory_allocated,
2417 .memory_pressure = &tcp_memory_pressure,
a4fe34bf 2418 .sysctl_mem = sysctl_tcp_mem,
2419 .sysctl_wmem = sysctl_tcp_wmem,
2420 .sysctl_rmem = sysctl_tcp_rmem,
2421 .max_header = MAX_TCP_HEADER,
2422 .obj_size = sizeof(struct tcp_sock),
3ab5aee7 2423 .slab_flags = SLAB_DESTROY_BY_RCU,
6d6ee43e 2424 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 2425 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 2426 .h.hashinfo = &tcp_hashinfo,
7ba42910 2427 .no_autobind = true,
2428#ifdef CONFIG_COMPAT
2429 .compat_setsockopt = compat_tcp_setsockopt,
2430 .compat_getsockopt = compat_tcp_getsockopt,
2431#endif
c255a458 2432#ifdef CONFIG_MEMCG_KMEM
2433 .init_cgroup = tcp_init_cgroup,
2434 .destroy_cgroup = tcp_destroy_cgroup,
2435 .proto_cgroup = tcp_proto_cgroup,
2436#endif
1da177e4 2437};
4bc2f18b 2438EXPORT_SYMBOL(tcp_prot);
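/*
 * tcp_prot is the dispatch table the generic socket layer calls through
 * for AF_INET/SOCK_STREAM sockets. SLAB_DESTROY_BY_RCU on its slab means a
 * freed tcp_sock may be recycled before an RCU grace period elapses, which
 * is why lockless lookups must take a reference and then re-check the
 * socket's identity.
 */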
1da177e4 2439
2440static void __net_exit tcp_sk_exit(struct net *net)
2441{
2442 int cpu;
2443
2444 for_each_possible_cpu(cpu)
2445 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2446 free_percpu(net->ipv4.tcp_sk);
2447}
2448
2449static int __net_init tcp_sk_init(struct net *net)
2450{
2451 int res, cpu;
2452
2453 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2454 if (!net->ipv4.tcp_sk)
2455 return -ENOMEM;
2456
2457 for_each_possible_cpu(cpu) {
2458 struct sock *sk;
2459
2460 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2461 IPPROTO_TCP, net);
2462 if (res)
2463 goto fail;
2464 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2465 }
5d134f1c 2466 net->ipv4.sysctl_tcp_ecn = 2;
b0f9ca53 2467 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
6b58e0a5 2468 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
05cbc0db 2469 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
be9f4a44 2470 return 0;
046ee902 2471
2472fail:
2473 tcp_sk_exit(net);
2474
2475 return res;
2476}
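/*
 * tcp_sk_init() builds one kernel-internal control socket per possible
 * CPU; these are what tcp_v4_send_reset() and tcp_v4_send_ack() transmit
 * from, so no user socket or shared lock is needed to emit a RST or ACK.
 * The per-netns defaults set above (ECN mode, base MSS, probing threshold
 * and interval) apply to each namespace as it is created.
 */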
2477
2478static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2479{
2480 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2481}
2482
2483static struct pernet_operations __net_initdata tcp_sk_ops = {
2484 .init = tcp_sk_init,
2485 .exit = tcp_sk_exit,
2486 .exit_batch = tcp_sk_exit_batch,
2487};
2488
9b0f976f 2489void __init tcp_v4_init(void)
1da177e4 2490{
5caea4ea 2491 inet_hashinfo_init(&tcp_hashinfo);
6a1b3054 2492 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 2493 panic("Failed to create the TCP control socket.\n");
1da177e4 2494}