tcp: add tcp_mss_clamp() helper
net/ipv4/tcp_ipv4.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

int sysctl_tcp_low_latency __read_mostly;

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
{
        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
                                          ip_hdr(skb)->saddr,
                                          tcp_hdr(skb)->dest,
                                          tcp_hdr(skb)->source, tsoff);
}
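
/* For orientation (summary of behaviour implemented elsewhere in the tree,
 * not a guarantee of this file): secure_tcp_sequence_number() derives the
 * ISN from a keyed hash over the connection 4-tuple plus a clock component,
 * in the spirit of RFC 6528, and returns a per-connection timestamp offset
 * through *tsoff.
 */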

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's one, only timestamp cache is
           held not per host, but per port pair and TW bucket is used as state
           holder.

           If TW bucket has already been destroyed we fall back to VJ's scheme
           and use initial timestamp retrieved from peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
                      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
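
/* Sketch of the intended call shape (assumed; the actual caller is the
 * twsk_unique() hook used during connect-time hash insertion): when a fresh
 * connect() collides with a TIME_WAIT bucket on the same 4-tuple, a nonzero
 * return here lets the port pair be reused and seeds write_seq safely past
 * the old connection's sequence space.
 */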

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;
        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             lockdep_sock_is_held(sk));
        if (inet_opt && inet_opt->opt.srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet_opt->opt.faddr;
        }

        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                              IPPROTO_TCP,
                              orig_sport, orig_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                if (err == -ENETUNREACH)
                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return err;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet_opt || !inet_opt->opt.srr)
                daddr = fl4->daddr;

        if (!inet->inet_saddr)
                inet->inet_saddr = fl4->saddr;
        sk_rcv_saddr_set(sk, inet->inet_saddr);

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                if (likely(!tp->repair))
                        tp->write_seq      = 0;
        }

        if (tcp_death_row->sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
                tcp_fetch_timewait_stamp(sk, &rt->dst);

        inet->inet_dport = usin->sin_port;
        sk_daddr_set(sk, daddr);

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the socket
         * lock, select a source port, enter ourselves into the hash tables and
         * complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(tcp_death_row, sk);
        if (err)
                goto failure;

        sk_set_txhash(sk);

        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                               inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                rt = NULL;
                goto failure;
        }
        /* OK, now commit destination to socket. */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);
        rt = NULL;

        if (!tp->write_seq && likely(!tp->repair))
                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                                                           inet->inet_daddr,
                                                           inet->inet_sport,
                                                           usin->sin_port,
                                                           &tp->tsoffset);

        inet->inet_id = tp->write_seq ^ jiffies;

        if (tcp_fastopen_defer_connect(sk, &err))
                return err;
        if (err)
                goto failure;

        err = tcp_connect(sk);

        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
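
/* Rough call path for orientation (assumed from the core stack, not
 * defined in this file):
 *
 *      connect(fd, ...)
 *        -> __inet_stream_connect()
 *          -> sk->sk_prot->connect == tcp_v4_connect()
 *            -> inet_hash_connect()  (pick a source port, hash the socket)
 *            -> tcp_connect()        (build and transmit the SYN)
 */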

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);
        u32 mtu = tcp_sk(sk)->mtu_info;

        dst = inet_csk_update_pmtu(sk, mtu);
        if (!dst)
                return;

        /* Something is about to be wrong... Remember soft error
         * for the case, if this connection will not be able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            ip_sk_accept_pmtu(sk) &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
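
/* Two invocation contexts, both visible in tcp_v4_err() below: when the
 * socket is not owned by user context the ICMP handler calls this
 * directly; otherwise it sets TCP_MTU_REDUCED_DEFERRED in sk_tsq_flags
 * and the work is replayed later from tcp_release_cb().
 */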

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_check(sk, 0);

        if (dst)
                dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
        struct request_sock *req = inet_reqsk(sk);
        struct net *net = sock_net(sk);

        /* ICMPs are not backlogged, hence we cannot get
         * an established socket here.
         */
        if (seq != tcp_rsk(req)->snt_isn) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
        } else if (abort) {
                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                tcp_listendrop(req->rsk_listener);
        }
        reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        struct request_sock *fastopen;
        __u32 seq, snd_una;
        __u32 remaining;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
                                       th->dest, iph->saddr, ntohs(th->source),
                                       inet_iif(icmp_skb));
        if (!sk) {
                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }
        seq = ntohl(th->seq);
        if (sk->sk_state == TCP_NEW_SYN_RECV)
                return tcp_req_err(sk, seq,
                                   type == ICMP_PARAMETERPROB ||
                                   type == ICMP_TIME_EXCEEDED ||
                                   (type == ICMP_DEST_UNREACH &&
                                    (code == ICMP_NET_UNREACH ||
                                     code == ICMP_HOST_UNREACH)));

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         * We do take care of PMTU discovery (RFC1191) special case :
         * we can receive locally generated ICMP messages while socket is held.
         */
        if (sock_owned_by_user(sk)) {
                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
        }
        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
        fastopen = tp->fastopen_rsk;
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, snd_una, tp->snd_nxt)) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_REDIRECT:
                do_redirect(icmp_skb, sk);
                goto out;
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        /* We are not interested in TCP_LISTEN and open_requests
                         * (SYN-ACKs sent out by Linux are always < 576 bytes so
                         * they should go through unfragmented).
                         */
                        if (sk->sk_state == TCP_LISTEN)
                                goto out;

                        tp->mtu_info = info;
                        if (!sock_owned_by_user(sk)) {
                                tcp_v4_mtu_reduced(sk);
                        } else {
                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
                                        sock_hold(sk);
                        }
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if icmp_skb allows revert of backoff
                 * (see draft-zimmermann-tcp-lcd) */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                if (seq != tp->snd_una || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff || fastopen)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                icsk->icsk_backoff--;
                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
                                               TCP_TIMEOUT_INIT;
                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

                skb = tcp_write_queue_head(sk);
                BUG_ON(!skb);

                remaining = icsk->icsk_rto -
                            min(icsk->icsk_rto,
                                tcp_time_stamp - tcp_skb_timestamp(skb));

                if (remaining) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:
                /* Only in fast or simultaneous open. If a fast open socket
                 * is already accepted it is treated as a connected one below.
                 */
                if (fastopen && !fastopen->sk)
                        break;

                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note, that in modern internet, where routing is unreliable
         * and in each dark corner broken firewalls sit, sending random
         * errors ordered by their masters even these two messages finally lose
         * their original sense (even Linux sends invalid PORT_UNREACHs)
         *
         * Now we are in compliance with RFCs.
         * --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(skb->len, saddr, daddr,
                                         csum_partial(th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        const struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
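
/* The offload split above, illustrated (this is a restatement of the code,
 * not new behaviour): with CHECKSUM_PARTIAL the stack stores only the
 * inverted pseudo-header sum in th->check, and the NIC folds in the
 * one's-complement sum of the TCP header plus payload starting at
 * csum_start, writing the result at csum_start + csum_offset. Without
 * offload the full checksum is computed in software via csum_partial().
 */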

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *                    for reset.
 *      Answer: if a packet caused RST, it is not for a socket
 *              existing in our system, if it is matched to a socket,
 *              it is just duplicate segment or bug in other side's TCP.
 *              So we build the reply based only on parameters
 *              that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key = NULL;
        const __u8 *hash_location = NULL;
        unsigned char newhash[16];
        int genhash;
        struct sock *sk1 = NULL;
#endif
        struct net *net;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        /* If sk not NULL, it means we did a successful lookup and incoming
         * route had to be correct. prequeue might have dropped our dst.
         */
        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
        rcu_read_lock();
        hash_location = tcp_parse_md5sig_option(th);
        if (sk && sk_fullsock(sk)) {
                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
        } else if (hash_location) {
                /*
                 * active side is lost. Try to find listening socket through
                 * source port, and then find md5 key through listening socket.
                 * We do not lose security here:
                 * Incoming packet is checked with md5 hash with finding key,
                 * no RST generated if md5 hash doesn't match.
                 */
                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
                                             ip_hdr(skb)->saddr,
                                             th->source, ip_hdr(skb)->daddr,
                                             ntohs(th->source), inet_iif(skb));
                /* don't send rst if it can't find key */
                if (!sk1)
                        goto out;

                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
                if (!key)
                        goto out;

                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
                if (genhash || memcmp(hash_location, newhash, 16) != 0)
                        goto out;

        }

        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

        /* When socket is gone, all binding information is lost.
         * routing might fail in this case. No choice here, if we choose to force
         * input interface, we will misroute in case of asymmetric route.
         */
        if (sk)
                arg.bound_dev_if = sk->sk_bound_dev_if;

        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

        arg.tos = ip_hdr(skb)->tos;
        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
        local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
        rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct net *net = sock_net(sk);
        struct ip_reply_arg arg;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (tsecr) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tsval);
                rep.opt[2] = htonl(tsecr);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (tsecr) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len/4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(sk, skb,
                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_time_stamp + tcptw->tw_ts_offset,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        tw->tw_tos
                        );

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
                                                 tcp_sk(sk)->snd_nxt;

        /* RFC 7323 2.3
         * The window field (SEG.WND) of every outgoing segment, with the
         * exception of <SYN> segments, MUST be right-shifted by
         * Rcv.Wind.Shift bits:
         */
        tcp_v4_send_ack(sk, skb, seq,
                        tcp_rsk(req)->rcv_nxt,
                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
                        tcp_time_stamp + tcp_rsk(req)->ts_off,
                        req->ts_recent,
                        0,
                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
                                          AF_INET),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        ip_hdr(skb)->tos);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              enum tcp_synack_type synack_type)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req, foc, synack_type);

        if (skb) {
                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
                                            ireq->ir_rmt_addr,
                                            ireq->opt);
                err = net_xmit_eval(err);
        }

        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
                                         const union tcp_md5_addr *addr,
                                         int family)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        unsigned int size = sizeof(struct in_addr);
        const struct tcp_md5sig_info *md5sig;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                size = sizeof(struct in6_addr);
#endif
        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;
                if (!memcmp(&key->addr, addr, size))
                        return key;
        }
        return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk)
{
        const union tcp_md5_addr *addr;

        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
        return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_info *md5sig;

        key = tcp_md5_do_lookup(sk, addr, family);
        if (key) {
                /* Pre-existing entry - just update that one. */
                memcpy(key->key, newkey, newkeylen);
                key->keylen = newkeylen;
                return 0;
        }

        md5sig = rcu_dereference_protected(tp->md5sig_info,
                                           lockdep_sock_is_held(sk));
        if (!md5sig) {
                md5sig = kmalloc(sizeof(*md5sig), gfp);
                if (!md5sig)
                        return -ENOMEM;

                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                INIT_HLIST_HEAD(&md5sig->head);
                rcu_assign_pointer(tp->md5sig_info, md5sig);
        }

        key = sock_kmalloc(sk, sizeof(*key), gfp);
        if (!key)
                return -ENOMEM;
        if (!tcp_alloc_md5sig_pool()) {
                sock_kfree_s(sk, key, sizeof(*key));
                return -ENOMEM;
        }

        memcpy(key->key, newkey, newkeylen);
        key->keylen = newkeylen;
        key->family = family;
        memcpy(&key->addr, addr,
               (family == AF_INET6) ? sizeof(struct in6_addr) :
                                      sizeof(struct in_addr));
        hlist_add_head_rcu(&key->node, &md5sig->head);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
        struct tcp_md5sig_key *key;

        key = tcp_md5_do_lookup(sk, addr, family);
        if (!key)
                return -ENOENT;
        hlist_del_rcu(&key->node);
        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
        kfree_rcu(key, rcu);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct hlist_node *n;
        struct tcp_md5sig_info *md5sig;

        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
                hlist_del_rcu(&key->node);
                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
                kfree_rcu(key, rcu);
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (!cmd.tcpm_keylen)
                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                                      AF_INET);

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                              AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
                              GFP_KERNEL);
}
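
/* Userspace view of the parser above (illustrative sketch, hypothetical
 * variable names): the TCP_MD5SIG socket option installs a per-peer key:
 *
 *      struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *      struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *      a->sin_family = AF_INET;
 *      a->sin_addr   = peer_addr;
 *      memcpy(md5.tcpm_key, "secret", 6);
 *      setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that address instead.
 */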

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
                                   __be32 daddr, __be32 saddr,
                                   const struct tcphdr *th, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;
        struct tcphdr *_th;

        bp = hp->scratch;
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        _th = (struct tcphdr *)(bp + 1);
        memcpy(_th, th, sizeof(*th));
        _th->check = 0;

        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
                                sizeof(*bp) + sizeof(*th));
        return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;
        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk,
                        const struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;
        const struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;

        if (sk) { /* valid for establish/request sockets */
                saddr = sk->sk_rcv_saddr;
                daddr = sk->sk_daddr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;

        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
                                    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
        /*
         * This gets called for each TCP segment that arrives
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        const __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        const struct tcphdr *th = tcp_hdr(skb);
        int genhash;
        unsigned char newhash[16];

        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
                                          AF_INET);
        hash_location = tcp_parse_md5sig_option(th);

        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return false;

        if (hash_expected && !hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                return true;
        }

        if (!hash_expected && hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                return true;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_md5_hash_skb(newhash,
                                      hash_expected,
                                      NULL, skb);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                                     &iph->saddr, ntohs(th->source),
                                     &iph->daddr, ntohs(th->dest),
                                     genhash ? " tcp_v4_calc_md5_hash failed"
                                     : "");
                return true;
        }
        return false;
#endif
        return false;
}

static void tcp_v4_init_req(struct request_sock *req,
                            const struct sock *sk_listener,
                            struct sk_buff *skb)
{
        struct inet_request_sock *ireq = inet_rsk(req);

        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
        ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
                                          struct flowi *fl,
                                          const struct request_sock *req,
                                          bool *strict)
{
        struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

        if (strict) {
                if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
                        *strict = true;
                else
                        *strict = false;
        }

        return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family          =      PF_INET,
        .obj_size        =      sizeof(struct tcp_request_sock),
        .rtx_syn_ack     =      tcp_rtx_synack,
        .send_ack        =      tcp_v4_reqsk_send_ack,
        .destructor      =      tcp_v4_reqsk_destructor,
        .send_reset      =      tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .mss_clamp       =      TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
        .req_md5_lookup  =      tcp_v4_md5_lookup,
        .calc_md5_hash   =      tcp_v4_md5_hash_skb,
#endif
        .init_req        =      tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
        .cookie_init_seq =      cookie_v4_init_sequence,
#endif
        .route_req       =      tcp_v4_route_req,
        .init_seq        =      tcp_v4_init_sequence,
        .send_synack     =      tcp_v4_send_synack,
};
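
/* These two ops tables are the IPv4 personality plugged into the
 * protocol-independent SYN handling: tcp_v4_conn_request() below hands
 * them to tcp_conn_request(), which calls back through .route_req,
 * .init_req, .init_seq and .send_synack at the appropriate steps.
 */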

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        /* Never answer to SYNs sent to broadcast or multicast */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        return tcp_conn_request(&tcp_request_sock_ops,
                                &tcp_request_sock_ipv4_ops, sk, skb);

drop:
        tcp_listendrop(sk);
        return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst,
                                  struct request_sock *req_unhash,
                                  bool *own_req)
{
        struct inet_request_sock *ireq;
        struct inet_sock *newinet;
        struct tcp_sock *newtp;
        struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif
        struct ip_options_rcu *inet_opt;

        if (sk_acceptq_is_full(sk))
                goto exit_overflow;

        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
                goto exit_nonewsk;

        newsk->sk_gso_type = SKB_GSO_TCPV4;
        inet_sk_rx_dst_set(newsk, skb);

        newtp                 = tcp_sk(newsk);
        newinet               = inet_sk(newsk);
        ireq                  = inet_rsk(req);
        sk_daddr_set(newsk, ireq->ir_rmt_addr);
        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
        newsk->sk_bound_dev_if = ireq->ir_iif;
        newinet->inet_saddr   = ireq->ir_loc_addr;
        inet_opt              = ireq->opt;
        rcu_assign_pointer(newinet->inet_opt, inet_opt);
        ireq->opt             = NULL;
        newinet->mc_index     = inet_iif(skb);
        newinet->mc_ttl       = ip_hdr(skb)->ttl;
        newinet->rcv_tos      = ip_hdr(skb)->tos;
        inet_csk(newsk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
        newinet->inet_id = newtp->write_seq ^ jiffies;

        if (!dst) {
                dst = inet_csk_route_child_sock(sk, newsk, req);
                if (!dst)
                        goto put_and_exit;
        } else {
                /* syncookie case : see end of cookie_v4_check() */
        }
        sk_setup_caps(newsk, dst);

        tcp_ca_openreq_child(newsk, dst);

        tcp_sync_mss(newsk, dst_mtu(dst));
        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

        tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
        /* Copy over the MD5 key from the original socket */
        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
                                AF_INET);
        if (key) {
                /*
                 * We're using one, so create a matching key
                 * on the newsk structure. If we fail to get
                 * memory, then we end up not copying the key
                 * across. Shucks.
                 */
                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
                               AF_INET, key->key, key->keylen, GFP_ATOMIC);
                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
        }
#endif

        if (__inet_inherit_port(sk, newsk) < 0)
                goto put_and_exit;
        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
        if (*own_req)
                tcp_move_syn(newtp, req);

        return newsk;

exit_overflow:
        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
        dst_release(dst);
exit:
        tcp_listendrop(sk);
        return NULL;
put_and_exit:
        inet_csk_prepare_forced_close(newsk);
        tcp_done(newsk);
        goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
        const struct tcphdr *th = tcp_hdr(skb);

        if (!th->syn)
                sk = cookie_v4_check(sk, skb);
#endif
        return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        struct sock *rsk;

        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
                struct dst_entry *dst = sk->sk_rx_dst;

                sock_rps_save_rxhash(sk, skb);
                sk_mark_napi_id(sk, skb);
                if (dst) {
                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
                            !dst->ops->check(dst, 0)) {
                                dst_release(dst);
                                sk->sk_rx_dst = NULL;
                        }
                }
                tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
                return 0;
        }

        if (tcp_checksum_complete(skb))
                goto csum_err;

        if (sk->sk_state == TCP_LISTEN) {
                struct sock *nsk = tcp_v4_cookie_check(sk, skb);

                if (!nsk)
                        goto discard;
                if (nsk != sk) {
                        sock_rps_save_rxhash(nsk, skb);
                        sk_mark_napi_id(nsk, skb);
                        if (tcp_child_process(sk, nsk, skb)) {
                                rsk = nsk;
                                goto reset;
                        }
                        return 0;
                }
        } else
                sock_rps_save_rxhash(sk, skb);

        if (tcp_rcv_state_process(sk, skb)) {
                rsk = sk;
                goto reset;
        }
        return 0;

reset:
        tcp_v4_send_reset(rsk, skb);
discard:
        kfree_skb(skb);
        /* Be careful here. If this function gets more complicated and
         * gcc suffers from register pressure on the x86, sk (in %ebx)
         * might be destroyed here. This current version compiles correctly,
         * but you have been warned.
         */
        return 0;

csum_err:
        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
        goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

void tcp_v4_early_demux(struct sk_buff *skb)
{
        const struct iphdr *iph;
        const struct tcphdr *th;
        struct sock *sk;

        if (skb->pkt_type != PACKET_HOST)
                return;

        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
                return;

        iph = ip_hdr(skb);
        th = tcp_hdr(skb);

        if (th->doff < sizeof(struct tcphdr) / 4)
                return;

        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
                                       iph->saddr, th->source,
                                       iph->daddr, ntohs(th->dest),
                                       skb->skb_iif);
        if (sk) {
                skb->sk = sk;
                skb->destructor = sock_edemux;
                if (sk_fullsock(sk)) {
                        struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

                        if (dst)
                                dst = dst_check(dst, 0);
                        if (dst &&
                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
                                skb_dst_set_noref(skb, dst);
                }
        }
}
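
/* Why early demux pays off (summary, not new behaviour): the established
 * socket is looked up once here, before IP input proper, so the normal
 * receive path can skip its own lookup; reusing the cached sk->sk_rx_dst
 * also avoids a fresh route lookup when the incoming interface still
 * matches the one the dst was learned on.
 */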

/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see why it failed. 8)8) --ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (sysctl_tcp_low_latency || !tp->ucopy.task)
                return false;

        if (skb->len <= tcp_hdrlen(skb) &&
            skb_queue_len(&tp->ucopy.prequeue) == 0)
                return false;

        /* Before escaping RCU protected region, we need to take care of skb
         * dst. Prequeue is only enabled for established sockets.
         * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
         * Instead of doing full sk_rx_dst validity here, let's perform
         * an optimistic check.
         */
        if (likely(sk->sk_rx_dst))
                skb_dst_drop(skb);
        else
                skb_dst_force_safe(skb);

        __skb_queue_tail(&tp->ucopy.prequeue, skb);
        tp->ucopy.memory += skb->truesize;
        if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
            tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
                struct sk_buff *skb1;

                BUG_ON(sock_owned_by_user(sk));
                __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
                                skb_queue_len(&tp->ucopy.prequeue));

                while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
                        sk_backlog_rcv(sk, skb1);

                tp->ucopy.memory = 0;
        } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
                wake_up_interruptible_sync_poll(sk_sleep(sk),
                                                POLLIN | POLLRDNORM | POLLRDBAND);
                if (!inet_csk_ack_scheduled(sk))
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                                                  (3 * tcp_rto_min(sk)) / 4,
                                                  TCP_RTO_MAX);
        }
        return true;
}
EXPORT_SYMBOL(tcp_prequeue);

bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
        u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

        /* Only socket owner can try to collapse/prune rx queues
         * to reduce memory overhead, so add a little headroom here.
         * Few sockets' backlogs are likely to be non-empty concurrently.
         */
        limit += 64*1024;

        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
         * we can fix skb->truesize to its real value to avoid future drops.
         * This is valid because skb is not yet charged to the socket.
         * It has been noticed pure SACK packets were sometimes dropped
         * (if cooked by drivers without copybreak feature).
         */
        skb_condense(skb);

        if (unlikely(sk_add_backlog(sk, skb, limit))) {
                bh_unlock_sock(sk);
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
                return true;
        }
        return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
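
/* Worked example of the limit above: with sk_rcvbuf = sk_sndbuf = 4 MB,
 * the backlog may accumulate roughly 8 MB + 64 KB of skb truesize while
 * the socket is owned by user context before sk_add_backlog() starts
 * refusing packets and LINUX_MIB_TCPBACKLOGDROP is incremented.
 */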

int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = (struct tcphdr *)skb->data;
        unsigned int eaten = skb->len;
        int err;

        err = sk_filter_trim_cap(sk, skb, th->doff * 4);
        if (!err) {
                eaten -= skb->len;
                TCP_SKB_CB(skb)->end_seq -= eaten;
        }
        return err;
}
EXPORT_SYMBOL(tcp_filter);
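
/* Example of the accounting above (illustrative numbers): if a socket
 * filter trims a 1500-byte segment down to its 40-byte header region,
 * eaten ends up as 1460 and end_seq is pulled back by the same amount,
 * so sequence bookkeeping matches the bytes actually kept.
 */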
1586
1da177e4
LT
1587/*
1588 * From tcp_input.c
1589 */
1590
1591int tcp_v4_rcv(struct sk_buff *skb)
1592{
3b24d854 1593 struct net *net = dev_net(skb->dev);
eddc9ec5 1594 const struct iphdr *iph;
cf533ea5 1595 const struct tcphdr *th;
3b24d854 1596 bool refcounted;
1da177e4
LT
1597 struct sock *sk;
1598 int ret;
1599
1600 if (skb->pkt_type != PACKET_HOST)
1601 goto discard_it;
1602
1603 /* Count it even if it's bad */
90bbcc60 1604 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1da177e4
LT
1605
1606 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1607 goto discard_it;
1608
ea1627c2 1609 th = (const struct tcphdr *)skb->data;
1da177e4 1610
ea1627c2 1611 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1da177e4
LT
1612 goto bad_packet;
1613 if (!pskb_may_pull(skb, th->doff * 4))
1614 goto discard_it;
1615
1616 /* An explanation is required here, I think.
1617 * Packet length and doff are validated by header prediction,
caa20d9a 1618 * provided case of th->doff==0 is eliminated.
1da177e4 1619 * So, we defer the checks. */
ed70fcfc
TH
1620
1621 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1622 goto csum_error;
1da177e4 1623
ea1627c2 1624 th = (const struct tcphdr *)skb->data;
eddc9ec5 1625 iph = ip_hdr(skb);
971f10ec
ED
1626 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1627 * barrier() makes sure compiler wont play fool^Waliasing games.
1628 */
1629 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1630 sizeof(struct inet_skb_parm));
1631 barrier();
1632
1da177e4
LT
1633 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1634 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1635 skb->len - th->doff * 4);
1636 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
e11ecddf 1637 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
04317daf 1638 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
b82d1bb4 1639 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1da177e4
LT
1640 TCP_SKB_CB(skb)->sacked = 0;
1641
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = tcp_check_req(sk, skb, req, false);
		if (!nsk) {
			reqsk_put(req);
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
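	/*
	 * Editor's note on the TCP_NEW_SYN_RECV block above: tcp_check_req()
	 * has three outcomes. NULL means no child socket resulted from this
	 * segment (it may have been answered or dropped internally), so it
	 * is discarded here; nsk == sk means no child was created and the
	 * listener continues through the normal path below; any other nsk
	 * is a newly minted child socket, which tcp_child_process() feeds
	 * the segment to. The extra sock_hold() keeps the listener alive
	 * for whichever path is taken.
	 */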
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
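	/*
	 * Editor's note: three delivery paths meet here. When no process
	 * owns the socket, the segment is either handed to the prequeue
	 * (a reader sleeps in tcp_recvmsg()) or processed right away by
	 * tcp_v4_do_rcv(). When the socket is owned, the segment parks in
	 * the backlog and is replayed once the owner releases the socket
	 * (see .backlog_rcv = tcp_v4_do_rcv in tcp_prot below).
	 */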
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			refcounted = false;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
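/*
 * Editor's note on do_time_wait: TCP_TW_SYN is the case where a new
 * connection legitimately reuses a TIME_WAIT 4-tuple; the timewait
 * socket is retired and the SYN is re-dispatched to a matching listener.
 * TCP_TW_ACK re-sends the final ACK, TCP_TW_RST answers with a reset,
 * and TCP_TW_SUCCESS means the segment needed no reply.
 */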

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

/* Cache the input route of @skb on @sk for early demux; dst_hold_safe()
 * protects against a dst whose refcount is already going away.
 */
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up the prequeue; it should already be empty. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* In case the socket was aborted during the connect operation. */
	tcp_free_fastopen_req(tp);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket following cur. If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero, the very first socket in the hash table is
 * returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get the first established socket starting from the bucket given in
 * st->bucket. If st->bucket is zero, the very first socket in the hash
 * is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}
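/*
 * Editor's note: established_get_first() deliberately returns with the
 * bucket's spinlock still held once it finds a socket;
 * established_get_next() and tcp_seq_stop() drop it. This asymmetry is
 * what lets the seq_file iterator keep walking one bucket across
 * successive ->next() calls without re-acquiring the lock each time.
 */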

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
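/*
 * Editor's note: a sequential reader of /proc/net/tcp re-enters
 * tcp_seq_start() once per read() chunk. Instead of re-walking *pos
 * entries from the very first bucket each time (quadratic on big
 * tables), the iterator caches (state, bucket, offset) in
 * tcp_iter_state and resumes from the saved bucket; only a
 * non-sequential seek falls back to the full walk in tcp_get_idx().
 */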

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
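/*
 * Editor's note: the seq_printf() above emits one /proc/net/tcp row.
 * Addresses are raw hex dumps of the __be32 values, so on little-endian
 * hosts 127.0.0.1:22 appears as "0100007F:0016" (the port, printed after
 * ntohs(), reads naturally). tx_queue/rx_queue, the timer tuple,
 * retransmits, uid and inode follow, matching the header row written by
 * tcp4_seq_show().
 */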

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 0;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
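/*
 * Editor's note: each assignment above seeds a per-namespace sysctl,
 * e.g. sysctl_tcp_syn_retries backs net.ipv4.tcp_syn_retries in a fresh
 * netns. For the derived values, assuming a hypothetical ehash_mask of
 * 65535: cnt = 65536, max_tw_buckets = (65536 + 1) / 2 = 32768, and
 * max_syn_backlog = max(128, 65536 / 256) = 256.
 */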

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}
1da177e4 2489}