tcp: move sk_mark_napi_id() at the right place
[linux-2.6-block.git] / net / ipv4 / tcp_ipv4.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

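/* Decide whether a TIME_WAIT bucket may be reused for a new outgoing
 * connection: if its timestamp state is usable (or the caller passed no
 * twp and so does not need a strictly unique tuple), recycle its
 * sequence and timestamp state into the new socket.
 */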
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	inet_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

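/* Handle an ICMP redirect for this connection: revalidate the cached
 * route and let its dst ops update the next hop.
 */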
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && fastopen->sk == NULL)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only based on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

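/* Answer a segment aimed at a TIME_WAIT socket with an ACK built from
 * the state stored in the timewait bucket.
 */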
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

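/* ACK a request that is still in SYN_RECV, on behalf of the listener
 * (regular case) or of the fast-open child socket.
 */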
static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      u16 queue_mapping,
			      struct tcp_fastopen_cookie *foc)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

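/* Remove the key matching (addr, family) from the socket's MD5 key list. */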
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

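/* Free every MD5 key attached to the socket, at socket destruction time. */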
static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

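/* setsockopt(TCP_MD5SIG) handler: copy the key from user space and add
 * it to (or, for an empty key, delete it from) the socket's key list.
 */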
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->ir_loc_addr;
		daddr = inet_rsk(req)->ir_rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
				      const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
}

static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	bool ret;

	rcu_read_lock();
	ret = __tcp_v4_inbound_md5_hash(sk, skb);
	rcu_read_unlock();

	return ret;
}

#endif

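/* Fill in the IPv4-specific fields of a new request_sock from the
 * incoming SYN.
 */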
static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
	.queue_hash_add =	inet_csk_reqsk_queue_hash_add,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->ir_rmt_addr;
	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	inet_set_txhash(newsk);
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

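/* Map an incoming segment on a listener to the socket that should
 * handle it: a pending request, an established child, or (for
 * syncookies) the listener itself.
 */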
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev, false);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    dst->ops->check(dst, 0) == NULL) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			sk_mark_napi_id(sk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

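/* Early demux: look the established socket up while the packet is
 * still in the IP layer, so its cached input route can be reused.
 */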
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk->sk_state != TCP_TIME_WAIT) {
			struct dst_entry *dst = sk->sk_rx_dst;

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
	 * Instead of doing full sk_rx_dst validity here, let's perform
	 * an optimistic check.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (tp->ucopy.memory > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
			sk_backlog_rcv(sk, skb1);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPPREQUEUEDROPPED);
		}

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);

1da177e4
LT
1580/*
1581 * From tcp_input.c
1582 */
1583
1584int tcp_v4_rcv(struct sk_buff *skb)
1585{
eddc9ec5 1586 const struct iphdr *iph;
cf533ea5 1587 const struct tcphdr *th;
1da177e4
LT
1588 struct sock *sk;
1589 int ret;
a86b1e30 1590 struct net *net = dev_net(skb->dev);
1da177e4
LT
1591
1592 if (skb->pkt_type != PACKET_HOST)
1593 goto discard_it;
1594
1595 /* Count it even if it's bad */
63231bdd 1596 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1da177e4
LT
1597
1598 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1599 goto discard_it;
1600
aa8223c7 1601 th = tcp_hdr(skb);
1da177e4
LT
1602
1603 if (th->doff < sizeof(struct tcphdr) / 4)
1604 goto bad_packet;
1605 if (!pskb_may_pull(skb, th->doff * 4))
1606 goto discard_it;
1607
1608 /* An explanation is required here, I think.
1609 * Packet length and doff are validated by header prediction,
caa20d9a 1610 * provided case of th->doff==0 is eliminated.
1da177e4 1611 * So, we defer the checks. */
ed70fcfc
TH
1612
1613 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1614 goto csum_error;
1da177e4 1615
aa8223c7 1616 th = tcp_hdr(skb);
eddc9ec5 1617 iph = ip_hdr(skb);
971f10ec
ED
1618 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1619 * barrier() makes sure compiler wont play fool^Waliasing games.
1620 */
1621 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1622 sizeof(struct inet_skb_parm));
1623 barrier();
1624
1da177e4
LT
1625 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1626 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1627 skb->len - th->doff * 4);
1628 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
e11ecddf 1629 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
04317daf 1630 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
b82d1bb4 1631 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1da177e4
LT
1632 TCP_SKB_CB(skb)->sacked = 0;
1633
9a1f27c4 1634 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1da177e4
LT
1635 if (!sk)
1636 goto no_tcp_socket;
1637
bb134d5d
ED
1638process:
1639 if (sk->sk_state == TCP_TIME_WAIT)
1640 goto do_time_wait;
1641
6cce09f8
ED
1642 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1643 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1644 goto discard_and_relse;
6cce09f8 1645 }
d218d111 1646
1da177e4
LT
1647 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1648 goto discard_and_relse;
9ea88a15
DP
1649
1650#ifdef CONFIG_TCP_MD5SIG
1651 /*
1652 * We really want to reject the packet as early as possible
1653 * if:
1654 * o We're expecting an MD5'd packet and this is no MD5 tcp option
1655 * o There is an MD5 option and we're not expecting one
1656 */
1657 if (tcp_v4_inbound_md5_hash(sk, skb))
1658 goto discard_and_relse;
1659#endif
1660
b59c2701 1661 nf_reset(skb);
1da177e4 1662
fda9ef5d 1663 if (sk_filter(sk, skb))
1da177e4
LT
1664 goto discard_and_relse;
1665
1666 skb->dev = NULL;
1667
c6366184 1668 bh_lock_sock_nested(sk);
1da177e4
LT
1669 ret = 0;
1670 if (!sock_owned_by_user(sk)) {
7bced397 1671 if (!tcp_prequeue(sk, skb))
1da177e4 1672 ret = tcp_v4_do_rcv(sk, skb);
da882c1f
ED
1673 } else if (unlikely(sk_add_backlog(sk, skb,
1674 sk->sk_rcvbuf + sk->sk_sndbuf))) {
6b03a53a 1675 bh_unlock_sock(sk);
6cce09f8 1676 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
6b03a53a
ZY
1677 goto discard_and_relse;
1678 }
1da177e4
LT
1679 bh_unlock_sock(sk);
1680
1681 sock_put(sk);
1682
1683 return ret;
1684
1685no_tcp_socket:
1686 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1687 goto discard_it;
1688
1689 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
6a5dc9e5
ED
1690csum_error:
1691 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1da177e4 1692bad_packet:
63231bdd 1693 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1da177e4 1694 } else {
cfb6eeb4 1695 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
1696 }
1697
1698discard_it:
1699 /* Discard frame. */
1700 kfree_skb(skb);
e905a9ed 1701 return 0;
1da177e4
LT
1702
1703discard_and_relse:
1704 sock_put(sk);
1705 goto discard_it;
1706
1707do_time_wait:
1708 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 1709 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1710 goto discard_it;
1711 }
1712
6a5dc9e5 1713 if (skb->len < (th->doff << 2)) {
9469c7b4 1714 inet_twsk_put(inet_twsk(sk));
6a5dc9e5
ED
1715 goto bad_packet;
1716 }
1717 if (tcp_checksum_complete(skb)) {
1718 inet_twsk_put(inet_twsk(sk));
1719 goto csum_error;
1da177e4 1720 }
9469c7b4 1721 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 1722 case TCP_TW_SYN: {
c346dca1 1723 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
c67499c0 1724 &tcp_hashinfo,
da5e3630 1725 iph->saddr, th->source,
eddc9ec5 1726 iph->daddr, th->dest,
463c84b9 1727 inet_iif(skb));
1da177e4 1728 if (sk2) {
9469c7b4
YH
1729 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1730 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1731 sk = sk2;
1732 goto process;
1733 }
1734 /* Fall through to ACK */
1735 }
1736 case TCP_TW_ACK:
1737 tcp_v4_timewait_ack(sk, skb);
1738 break;
1739 case TCP_TW_RST:
1740 goto no_tcp_socket;
1741 case TCP_TW_SUCCESS:;
1742 }
1743 goto discard_it;
1744}
1745
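/*
 * Editor's note: two minimal sketches, not part of the kernel source,
 * illustrating steps of tcp_v4_rcv() above; all example_* names are
 * hypothetical.
 *
 * First, the sequence-space accounting behind TCP_SKB_CB(skb)->end_seq:
 * SYN and FIN each consume one unit of sequence space, every payload
 * byte consumes one more, and th->doff counts the header in 32-bit words.
 */
static inline u32 example_end_seq(u32 seq, int syn, int fin,
				  unsigned int skb_len, unsigned int doff)
{
	return seq + syn + fin + (skb_len - doff * 4);
}

/*
 * Second, the MD5 early-reject rule from the comment above: drop on any
 * mismatch between "a key is configured for this peer" and "the segment
 * carries an MD5 option", in either direction. The real check (which
 * also verifies the digest) lives in tcp_v4_inbound_md5_hash().
 */
static inline bool example_md5_reject(bool key_expected, bool option_present)
{
	return key_expected != option_present;
}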
ccb7c410
DM
1746static struct timewait_sock_ops tcp_timewait_sock_ops = {
1747 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1748 .twsk_unique = tcp_twsk_unique,
1749 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 1750};
1da177e4 1751
63d02d15 1752void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
1753{
1754 struct dst_entry *dst = skb_dst(skb);
1755
ca777eff
ED
1756 if (dst) {
1757 dst_hold(dst);
1758 sk->sk_rx_dst = dst;
1759 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1760 }
5d299f3d 1761}
63d02d15 1762EXPORT_SYMBOL(inet_sk_rx_dst_set);
5d299f3d 1763
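/*
 * Editor's note: a minimal sketch, not part of the kernel source, of how
 * the dst cached by inet_sk_rx_dst_set() is meant to be consumed: a later
 * segment may reuse sk->sk_rx_dst only if it arrived on the interface
 * recorded in rx_dst_ifindex (the real fast-path check also revalidates
 * the dst itself via dst->ops->check()).
 */
static inline bool example_rx_dst_usable(const struct sock *sk,
					 const struct sk_buff *skb)
{
	return sk->sk_rx_dst &&
	       inet_sk(sk)->rx_dst_ifindex == skb->skb_iif;
}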
3b401a81 1764const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
1765 .queue_xmit = ip_queue_xmit,
1766 .send_check = tcp_v4_send_check,
1767 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 1768 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
1769 .conn_request = tcp_v4_conn_request,
1770 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
1771 .net_header_len = sizeof(struct iphdr),
1772 .setsockopt = ip_setsockopt,
1773 .getsockopt = ip_getsockopt,
1774 .addr2sockaddr = inet_csk_addr2sockaddr,
1775 .sockaddr_len = sizeof(struct sockaddr_in),
ab1e0a13 1776 .bind_conflict = inet_csk_bind_conflict,
3fdadf7d 1777#ifdef CONFIG_COMPAT
543d9cfe
ACM
1778 .compat_setsockopt = compat_ip_setsockopt,
1779 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 1780#endif
4fab9071 1781 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 1782};
4bc2f18b 1783EXPORT_SYMBOL(ipv4_specific);
1da177e4 1784
cfb6eeb4 1785#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 1786static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 1787 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 1788 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 1789 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 1790};
b6332e6c 1791#endif
cfb6eeb4 1792
1da177e4
LT
 1793/* NOTE: a lot of things are set to zero explicitly by the call to
 1794 * sk_alloc(), so they need not be done here.
1795 */
1796static int tcp_v4_init_sock(struct sock *sk)
1797{
6687e988 1798 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 1799
900f65d3 1800 tcp_init_sock(sk);
1da177e4 1801
8292a17a 1802 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 1803
cfb6eeb4 1804#ifdef CONFIG_TCP_MD5SIG
ac807fa8 1805 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 1806#endif
1da177e4 1807
1da177e4
LT
1808 return 0;
1809}
1810
7d06b2e0 1811void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
1812{
1813 struct tcp_sock *tp = tcp_sk(sk);
1814
1815 tcp_clear_xmit_timers(sk);
1816
6687e988 1817 tcp_cleanup_congestion_control(sk);
317a76f9 1818
1da177e4 1819 /* Clean up the write buffer. */
fe067e8a 1820 tcp_write_queue_purge(sk);
1da177e4
LT
1821
1822 /* Cleans up our, hopefully empty, out_of_order_queue. */
e905a9ed 1823 __skb_queue_purge(&tp->out_of_order_queue);
1da177e4 1824
cfb6eeb4
YH
1825#ifdef CONFIG_TCP_MD5SIG
1826 /* Clean up the MD5 key list, if any */
1827 if (tp->md5sig_info) {
a915da9b 1828 tcp_clear_md5_list(sk);
a8afca03 1829 kfree_rcu(tp->md5sig_info, rcu);
cfb6eeb4
YH
1830 tp->md5sig_info = NULL;
1831 }
1832#endif
1a2449a8 1833
1da177e4
LT
 1834 /* Clean the prequeue; it really must be empty */
1835 __skb_queue_purge(&tp->ucopy.prequeue);
1836
1837 /* Clean up a referenced TCP bind bucket. */
463c84b9 1838 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 1839 inet_put_port(sk);
1da177e4 1840
168a8f58 1841 BUG_ON(tp->fastopen_rsk != NULL);
435cf559 1842
cf60af03
YC
 1843 /* If the socket was aborted during the connect operation */
1844 tcp_free_fastopen_req(tp);
1845
180d8cd9 1846 sk_sockets_allocated_dec(sk);
d1a4c0b3 1847 sock_release_memcg(sk);
1da177e4 1848}
1da177e4
LT
1849EXPORT_SYMBOL(tcp_v4_destroy_sock);
1850
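/*
 * Editor's note: a minimal sketch, not part of the kernel source, of the
 * RCU-deferred free used for tp->md5sig_info above. Lockless readers on
 * the receive path may still hold the old pointer, so the memory is only
 * reclaimed after a grace period; names here are hypothetical.
 */
struct example_info {
	struct rcu_head rcu;
	int payload;
};

static void example_release(struct example_info *info)
{
	kfree_rcu(info, rcu);	/* freed once current RCU readers are done */
}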
1851#ifdef CONFIG_PROC_FS
1852/* Proc filesystem TCP sock list dumping. */
1853
a8b690f9
TH
1854/*
 1855 * Get the next listener socket following cur. If cur is NULL, get the first
 1856 * socket starting from the bucket given in st->bucket; when st->bucket is
 1857 * zero, the very first socket in the hash table is returned.
1858 */
1da177e4
LT
1859static void *listening_get_next(struct seq_file *seq, void *cur)
1860{
463c84b9 1861 struct inet_connection_sock *icsk;
c25eb3bf 1862 struct hlist_nulls_node *node;
1da177e4 1863 struct sock *sk = cur;
5caea4ea 1864 struct inet_listen_hashbucket *ilb;
5799de0b 1865 struct tcp_iter_state *st = seq->private;
a4146b1b 1866 struct net *net = seq_file_net(seq);
1da177e4
LT
1867
1868 if (!sk) {
a8b690f9 1869 ilb = &tcp_hashinfo.listening_hash[st->bucket];
5caea4ea 1870 spin_lock_bh(&ilb->lock);
c25eb3bf 1871 sk = sk_nulls_head(&ilb->head);
a8b690f9 1872 st->offset = 0;
1da177e4
LT
1873 goto get_sk;
1874 }
5caea4ea 1875 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 1876 ++st->num;
a8b690f9 1877 ++st->offset;
1da177e4
LT
1878
1879 if (st->state == TCP_SEQ_STATE_OPENREQ) {
60236fdd 1880 struct request_sock *req = cur;
1da177e4 1881
72a3effa 1882 icsk = inet_csk(st->syn_wait_sk);
1da177e4
LT
1883 req = req->dl_next;
1884 while (1) {
1885 while (req) {
bdccc4ca 1886 if (req->rsk_ops->family == st->family) {
1da177e4
LT
1887 cur = req;
1888 goto out;
1889 }
1890 req = req->dl_next;
1891 }
72a3effa 1892 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1da177e4
LT
1893 break;
1894get_req:
463c84b9 1895 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1da177e4 1896 }
1bde5ac4 1897 sk = sk_nulls_next(st->syn_wait_sk);
1da177e4 1898 st->state = TCP_SEQ_STATE_LISTENING;
463c84b9 1899 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 1900 } else {
e905a9ed 1901 icsk = inet_csk(sk);
463c84b9
ACM
1902 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1903 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1da177e4 1904 goto start_req;
463c84b9 1905 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1bde5ac4 1906 sk = sk_nulls_next(sk);
1da177e4
LT
1907 }
1908get_sk:
c25eb3bf 1909 sk_nulls_for_each_from(sk, node) {
8475ef9f
PE
1910 if (!net_eq(sock_net(sk), net))
1911 continue;
1912 if (sk->sk_family == st->family) {
1da177e4
LT
1913 cur = sk;
1914 goto out;
1915 }
e905a9ed 1916 icsk = inet_csk(sk);
463c84b9
ACM
1917 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1918 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1da177e4
LT
1919start_req:
1920 st->uid = sock_i_uid(sk);
1921 st->syn_wait_sk = sk;
1922 st->state = TCP_SEQ_STATE_OPENREQ;
1923 st->sbucket = 0;
1924 goto get_req;
1925 }
463c84b9 1926 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 1927 }
5caea4ea 1928 spin_unlock_bh(&ilb->lock);
a8b690f9 1929 st->offset = 0;
0f7ff927 1930 if (++st->bucket < INET_LHTABLE_SIZE) {
5caea4ea
ED
1931 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1932 spin_lock_bh(&ilb->lock);
c25eb3bf 1933 sk = sk_nulls_head(&ilb->head);
1da177e4
LT
1934 goto get_sk;
1935 }
1936 cur = NULL;
1937out:
1938 return cur;
1939}
1940
1941static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1942{
a8b690f9
TH
1943 struct tcp_iter_state *st = seq->private;
1944 void *rc;
1945
1946 st->bucket = 0;
1947 st->offset = 0;
1948 rc = listening_get_next(seq, NULL);
1da177e4
LT
1949
1950 while (rc && *pos) {
1951 rc = listening_get_next(seq, rc);
1952 --*pos;
1953 }
1954 return rc;
1955}
1956
05dbc7b5 1957static inline bool empty_bucket(const struct tcp_iter_state *st)
6eac5604 1958{
05dbc7b5 1959 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
6eac5604
AK
1960}
1961
a8b690f9
TH
1962/*
 1963 * Get the first established socket, starting from the bucket given in st->bucket.
1964 * If st->bucket is zero, the very first socket in the hash is returned.
1965 */
1da177e4
LT
1966static void *established_get_first(struct seq_file *seq)
1967{
5799de0b 1968 struct tcp_iter_state *st = seq->private;
a4146b1b 1969 struct net *net = seq_file_net(seq);
1da177e4
LT
1970 void *rc = NULL;
1971
a8b690f9
TH
1972 st->offset = 0;
1973 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 1974 struct sock *sk;
3ab5aee7 1975 struct hlist_nulls_node *node;
9db66bdc 1976 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 1977
6eac5604
AK
1978 /* Lockless fast path for the common case of empty buckets */
1979 if (empty_bucket(st))
1980 continue;
1981
9db66bdc 1982 spin_lock_bh(lock);
3ab5aee7 1983 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
f40c8174 1984 if (sk->sk_family != st->family ||
878628fb 1985 !net_eq(sock_net(sk), net)) {
1da177e4
LT
1986 continue;
1987 }
1988 rc = sk;
1989 goto out;
1990 }
9db66bdc 1991 spin_unlock_bh(lock);
1da177e4
LT
1992 }
1993out:
1994 return rc;
1995}
1996
1997static void *established_get_next(struct seq_file *seq, void *cur)
1998{
1999 struct sock *sk = cur;
3ab5aee7 2000 struct hlist_nulls_node *node;
5799de0b 2001 struct tcp_iter_state *st = seq->private;
a4146b1b 2002 struct net *net = seq_file_net(seq);
1da177e4
LT
2003
2004 ++st->num;
a8b690f9 2005 ++st->offset;
1da177e4 2006
05dbc7b5 2007 sk = sk_nulls_next(sk);
1da177e4 2008
3ab5aee7 2009 sk_nulls_for_each_from(sk, node) {
878628fb 2010 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
05dbc7b5 2011 return sk;
1da177e4
LT
2012 }
2013
05dbc7b5
ED
2014 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2015 ++st->bucket;
2016 return established_get_first(seq);
1da177e4
LT
2017}
2018
2019static void *established_get_idx(struct seq_file *seq, loff_t pos)
2020{
a8b690f9
TH
2021 struct tcp_iter_state *st = seq->private;
2022 void *rc;
2023
2024 st->bucket = 0;
2025 rc = established_get_first(seq);
1da177e4
LT
2026
2027 while (rc && pos) {
2028 rc = established_get_next(seq, rc);
2029 --pos;
7174259e 2030 }
1da177e4
LT
2031 return rc;
2032}
2033
2034static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2035{
2036 void *rc;
5799de0b 2037 struct tcp_iter_state *st = seq->private;
1da177e4 2038
1da177e4
LT
2039 st->state = TCP_SEQ_STATE_LISTENING;
2040 rc = listening_get_idx(seq, &pos);
2041
2042 if (!rc) {
1da177e4
LT
2043 st->state = TCP_SEQ_STATE_ESTABLISHED;
2044 rc = established_get_idx(seq, pos);
2045 }
2046
2047 return rc;
2048}
2049
a8b690f9
TH
2050static void *tcp_seek_last_pos(struct seq_file *seq)
2051{
2052 struct tcp_iter_state *st = seq->private;
2053 int offset = st->offset;
2054 int orig_num = st->num;
2055 void *rc = NULL;
2056
2057 switch (st->state) {
2058 case TCP_SEQ_STATE_OPENREQ:
2059 case TCP_SEQ_STATE_LISTENING:
2060 if (st->bucket >= INET_LHTABLE_SIZE)
2061 break;
2062 st->state = TCP_SEQ_STATE_LISTENING;
2063 rc = listening_get_next(seq, NULL);
2064 while (offset-- && rc)
2065 rc = listening_get_next(seq, rc);
2066 if (rc)
2067 break;
2068 st->bucket = 0;
05dbc7b5 2069 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2070 /* Fallthrough */
2071 case TCP_SEQ_STATE_ESTABLISHED:
a8b690f9
TH
2072 if (st->bucket > tcp_hashinfo.ehash_mask)
2073 break;
2074 rc = established_get_first(seq);
2075 while (offset-- && rc)
2076 rc = established_get_next(seq, rc);
2077 }
2078
2079 st->num = orig_num;
2080
2081 return rc;
2082}
2083
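/*
 * Editor's note: a minimal sketch, not part of the kernel source, of the
 * contract behind tcp_seek_last_pos(): when a reader resumes at the same
 * *pos it last stopped at, the cursor is rebuilt from the saved
 * (state, bucket, offset) triple instead of rewinding from entry zero,
 * keeping restarts of a large /proc/net/tcp read cheap.
 */
static void *example_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;

	if (*pos && *pos == st->last_pos)	/* resuming where we stopped? */
		return tcp_seek_last_pos(seq);	/* replay bucket + offset only */
	return NULL;				/* caller falls back to a full walk */
}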
1da177e4
LT
2084static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2085{
5799de0b 2086 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2087 void *rc;
2088
2089 if (*pos && *pos == st->last_pos) {
2090 rc = tcp_seek_last_pos(seq);
2091 if (rc)
2092 goto out;
2093 }
2094
1da177e4
LT
2095 st->state = TCP_SEQ_STATE_LISTENING;
2096 st->num = 0;
a8b690f9
TH
2097 st->bucket = 0;
2098 st->offset = 0;
2099 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2100
2101out:
2102 st->last_pos = *pos;
2103 return rc;
1da177e4
LT
2104}
2105
2106static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2107{
a8b690f9 2108 struct tcp_iter_state *st = seq->private;
1da177e4 2109 void *rc = NULL;
1da177e4
LT
2110
2111 if (v == SEQ_START_TOKEN) {
2112 rc = tcp_get_idx(seq, 0);
2113 goto out;
2114 }
1da177e4
LT
2115
2116 switch (st->state) {
2117 case TCP_SEQ_STATE_OPENREQ:
2118 case TCP_SEQ_STATE_LISTENING:
2119 rc = listening_get_next(seq, v);
2120 if (!rc) {
1da177e4 2121 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2122 st->bucket = 0;
2123 st->offset = 0;
1da177e4
LT
2124 rc = established_get_first(seq);
2125 }
2126 break;
2127 case TCP_SEQ_STATE_ESTABLISHED:
1da177e4
LT
2128 rc = established_get_next(seq, v);
2129 break;
2130 }
2131out:
2132 ++*pos;
a8b690f9 2133 st->last_pos = *pos;
1da177e4
LT
2134 return rc;
2135}
2136
2137static void tcp_seq_stop(struct seq_file *seq, void *v)
2138{
5799de0b 2139 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2140
2141 switch (st->state) {
2142 case TCP_SEQ_STATE_OPENREQ:
2143 if (v) {
463c84b9
ACM
2144 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2145 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4
LT
2146 }
2147 case TCP_SEQ_STATE_LISTENING:
2148 if (v != SEQ_START_TOKEN)
5caea4ea 2149 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4 2150 break;
1da177e4
LT
2151 case TCP_SEQ_STATE_ESTABLISHED:
2152 if (v)
9db66bdc 2153 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2154 break;
2155 }
2156}
2157
73cb88ec 2158int tcp_seq_open(struct inode *inode, struct file *file)
1da177e4 2159{
d9dda78b 2160 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
1da177e4 2161 struct tcp_iter_state *s;
52d6f3f1 2162 int err;
1da177e4 2163
52d6f3f1
DL
2164 err = seq_open_net(inode, file, &afinfo->seq_ops,
2165 sizeof(struct tcp_iter_state));
2166 if (err < 0)
2167 return err;
f40c8174 2168
52d6f3f1 2169 s = ((struct seq_file *)file->private_data)->private;
1da177e4 2170 s->family = afinfo->family;
688d1945 2171 s->last_pos = 0;
f40c8174
DL
2172 return 0;
2173}
73cb88ec 2174EXPORT_SYMBOL(tcp_seq_open);
f40c8174 2175
6f8b13bc 2176int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4
LT
2177{
2178 int rc = 0;
2179 struct proc_dir_entry *p;
2180
9427c4b3
DL
2181 afinfo->seq_ops.start = tcp_seq_start;
2182 afinfo->seq_ops.next = tcp_seq_next;
2183 afinfo->seq_ops.stop = tcp_seq_stop;
2184
84841c3c 2185 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
73cb88ec 2186 afinfo->seq_fops, afinfo);
84841c3c 2187 if (!p)
1da177e4
LT
2188 rc = -ENOMEM;
2189 return rc;
2190}
4bc2f18b 2191EXPORT_SYMBOL(tcp_proc_register);
1da177e4 2192
6f8b13bc 2193void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4 2194{
ece31ffd 2195 remove_proc_entry(afinfo->name, net->proc_net);
1da177e4 2196}
4bc2f18b 2197EXPORT_SYMBOL(tcp_proc_unregister);
1da177e4 2198
cf533ea5 2199static void get_openreq4(const struct sock *sk, const struct request_sock *req,
652586df 2200 struct seq_file *f, int i, kuid_t uid)
1da177e4 2201{
2e6599cb 2202 const struct inet_request_sock *ireq = inet_rsk(req);
a399a805 2203 long delta = req->expires - jiffies;
1da177e4 2204
5e659e4c 2205 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2206 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2207 i,
634fb979 2208 ireq->ir_loc_addr,
c720c7e8 2209 ntohs(inet_sk(sk)->inet_sport),
634fb979
ED
2210 ireq->ir_rmt_addr,
2211 ntohs(ireq->ir_rmt_port),
1da177e4
LT
2212 TCP_SYN_RECV,
2213 0, 0, /* could print option size, but that is af dependent. */
2214 1, /* timers active (only the expire timer) */
a399a805 2215 jiffies_delta_to_clock_t(delta),
e6c022a4 2216 req->num_timeout,
a7cb5a49 2217 from_kuid_munged(seq_user_ns(f), uid),
1da177e4
LT
2218 0, /* non standard timer */
2219 0, /* open_requests have no inode */
2220 atomic_read(&sk->sk_refcnt),
652586df 2221 req);
1da177e4
LT
2222}
2223
652586df 2224static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
1da177e4
LT
2225{
2226 int timer_active;
2227 unsigned long timer_expires;
cf533ea5 2228 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2229 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2230 const struct inet_sock *inet = inet_sk(sk);
168a8f58 2231 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2232 __be32 dest = inet->inet_daddr;
2233 __be32 src = inet->inet_rcv_saddr;
2234 __u16 destp = ntohs(inet->inet_dport);
2235 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2236 int rx_queue;
1da177e4 2237
6ba8a3b1
ND
2238 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2239 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2240 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2241 timer_active = 1;
463c84b9
ACM
2242 timer_expires = icsk->icsk_timeout;
2243 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2244 timer_active = 4;
463c84b9 2245 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2246 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2247 timer_active = 2;
cf4c6bf8 2248 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2249 } else {
2250 timer_active = 0;
2251 timer_expires = jiffies;
2252 }
2253
49d09007
ED
2254 if (sk->sk_state == TCP_LISTEN)
2255 rx_queue = sk->sk_ack_backlog;
2256 else
2257 /*
 2258 * Because we don't lock the socket, we might find a transient negative value.
2259 */
2260 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2261
5e659e4c 2262 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2263 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
cf4c6bf8 2264 i, src, srcp, dest, destp, sk->sk_state,
47da8ee6 2265 tp->write_seq - tp->snd_una,
49d09007 2266 rx_queue,
1da177e4 2267 timer_active,
a399a805 2268 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2269 icsk->icsk_retransmits,
a7cb5a49 2270 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2271 icsk->icsk_probes_out,
cf4c6bf8
IJ
2272 sock_i_ino(sk),
2273 atomic_read(&sk->sk_refcnt), sk,
7be87351
SH
2274 jiffies_to_clock_t(icsk->icsk_rto),
2275 jiffies_to_clock_t(icsk->icsk_ack.ato),
463c84b9 2276 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1da177e4 2277 tp->snd_cwnd,
168a8f58
JC
2278 sk->sk_state == TCP_LISTEN ?
2279 (fastopenq ? fastopenq->max_qlen : 0) :
652586df 2280 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
1da177e4
LT
2281}
2282
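/*
 * Editor's note: a minimal sketch, not part of the kernel source, of the
 * clamp used for rx_queue above. rcv_nxt and copied_seq are sampled
 * without the socket lock, so their difference can transiently appear
 * negative; reporting zero in that case is deliberate.
 */
static inline int example_rx_queue(u32 rcv_nxt, u32 copied_seq)
{
	return max_t(int, (int)(rcv_nxt - copied_seq), 0);
}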
cf533ea5 2283static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2284 struct seq_file *f, int i)
1da177e4 2285{
23f33c2d 2286 __be32 dest, src;
1da177e4 2287 __u16 destp, srcp;
e2a1d3e4 2288 s32 delta = tw->tw_ttd - inet_tw_time_stamp();
1da177e4
LT
2289
2290 dest = tw->tw_daddr;
2291 src = tw->tw_rcv_saddr;
2292 destp = ntohs(tw->tw_dport);
2293 srcp = ntohs(tw->tw_sport);
2294
5e659e4c 2295 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2296 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2297 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2298 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
652586df 2299 atomic_read(&tw->tw_refcnt), tw);
1da177e4
LT
2300}
2301
2302#define TMPSZ 150
2303
2304static int tcp4_seq_show(struct seq_file *seq, void *v)
2305{
5799de0b 2306 struct tcp_iter_state *st;
05dbc7b5 2307 struct sock *sk = v;
1da177e4 2308
652586df 2309 seq_setwidth(seq, TMPSZ - 1);
1da177e4 2310 if (v == SEQ_START_TOKEN) {
652586df 2311 seq_puts(seq, " sl local_address rem_address st tx_queue "
1da177e4
LT
2312 "rx_queue tr tm->when retrnsmt uid timeout "
2313 "inode");
2314 goto out;
2315 }
2316 st = seq->private;
2317
2318 switch (st->state) {
2319 case TCP_SEQ_STATE_LISTENING:
2320 case TCP_SEQ_STATE_ESTABLISHED:
05dbc7b5 2321 if (sk->sk_state == TCP_TIME_WAIT)
652586df 2322 get_timewait4_sock(v, seq, st->num);
05dbc7b5 2323 else
652586df 2324 get_tcp4_sock(v, seq, st->num);
1da177e4
LT
2325 break;
2326 case TCP_SEQ_STATE_OPENREQ:
652586df 2327 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
1da177e4
LT
2328 break;
2329 }
1da177e4 2330out:
652586df 2331 seq_pad(seq, '\n');
1da177e4
LT
2332 return 0;
2333}
2334
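/*
 * Editor's note: seq_setwidth()/seq_pad() above emit fixed-width records:
 * every line of /proc/net/tcp, header included, is padded to TMPSZ - 1
 * characters plus '\n'. Under that assumption, a reader could seek
 * straight to a record with a hypothetical helper like this:
 */
static inline loff_t example_record_offset(int n)
{
	return (loff_t)n * TMPSZ;	/* record n starts at byte n * TMPSZ */
}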
73cb88ec
AV
2335static const struct file_operations tcp_afinfo_seq_fops = {
2336 .owner = THIS_MODULE,
2337 .open = tcp_seq_open,
2338 .read = seq_read,
2339 .llseek = seq_lseek,
2340 .release = seq_release_net
2341};
2342
1da177e4 2343static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4
LT
2344 .name = "tcp",
2345 .family = AF_INET,
73cb88ec 2346 .seq_fops = &tcp_afinfo_seq_fops,
9427c4b3
DL
2347 .seq_ops = {
2348 .show = tcp4_seq_show,
2349 },
1da177e4
LT
2350};
2351
2c8c1e72 2352static int __net_init tcp4_proc_init_net(struct net *net)
757764f6
PE
2353{
2354 return tcp_proc_register(net, &tcp4_seq_afinfo);
2355}
2356
2c8c1e72 2357static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6
PE
2358{
2359 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2360}
2361
2362static struct pernet_operations tcp4_net_ops = {
2363 .init = tcp4_proc_init_net,
2364 .exit = tcp4_proc_exit_net,
2365};
2366
1da177e4
LT
2367int __init tcp4_proc_init(void)
2368{
757764f6 2369 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2370}
2371
2372void tcp4_proc_exit(void)
2373{
757764f6 2374 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2375}
2376#endif /* CONFIG_PROC_FS */
2377
2378struct proto tcp_prot = {
2379 .name = "TCP",
2380 .owner = THIS_MODULE,
2381 .close = tcp_close,
2382 .connect = tcp_v4_connect,
2383 .disconnect = tcp_disconnect,
463c84b9 2384 .accept = inet_csk_accept,
1da177e4
LT
2385 .ioctl = tcp_ioctl,
2386 .init = tcp_v4_init_sock,
2387 .destroy = tcp_v4_destroy_sock,
2388 .shutdown = tcp_shutdown,
2389 .setsockopt = tcp_setsockopt,
2390 .getsockopt = tcp_getsockopt,
1da177e4 2391 .recvmsg = tcp_recvmsg,
7ba42910
CG
2392 .sendmsg = tcp_sendmsg,
2393 .sendpage = tcp_sendpage,
1da177e4 2394 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 2395 .release_cb = tcp_release_cb,
ab1e0a13
ACM
2396 .hash = inet_hash,
2397 .unhash = inet_unhash,
2398 .get_port = inet_csk_get_port,
1da177e4 2399 .enter_memory_pressure = tcp_enter_memory_pressure,
c9bee3b7 2400 .stream_memory_free = tcp_stream_memory_free,
1da177e4 2401 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 2402 .orphan_count = &tcp_orphan_count,
1da177e4
LT
2403 .memory_allocated = &tcp_memory_allocated,
2404 .memory_pressure = &tcp_memory_pressure,
a4fe34bf 2405 .sysctl_mem = sysctl_tcp_mem,
1da177e4
LT
2406 .sysctl_wmem = sysctl_tcp_wmem,
2407 .sysctl_rmem = sysctl_tcp_rmem,
2408 .max_header = MAX_TCP_HEADER,
2409 .obj_size = sizeof(struct tcp_sock),
3ab5aee7 2410 .slab_flags = SLAB_DESTROY_BY_RCU,
6d6ee43e 2411 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 2412 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 2413 .h.hashinfo = &tcp_hashinfo,
7ba42910 2414 .no_autobind = true,
543d9cfe
ACM
2415#ifdef CONFIG_COMPAT
2416 .compat_setsockopt = compat_tcp_setsockopt,
2417 .compat_getsockopt = compat_tcp_getsockopt,
2418#endif
c255a458 2419#ifdef CONFIG_MEMCG_KMEM
d1a4c0b3
GC
2420 .init_cgroup = tcp_init_cgroup,
2421 .destroy_cgroup = tcp_destroy_cgroup,
2422 .proto_cgroup = tcp_proto_cgroup,
2423#endif
1da177e4 2424};
4bc2f18b 2425EXPORT_SYMBOL(tcp_prot);
1da177e4 2426
046ee902
DL
2427static int __net_init tcp_sk_init(struct net *net)
2428{
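	/* 2: answer incoming ECN setup requests, but do not request ECN on outgoing connections */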
5d134f1c 2429 net->ipv4.sysctl_tcp_ecn = 2;
be9f4a44 2430 return 0;
046ee902
DL
2431}
2432
2433static void __net_exit tcp_sk_exit(struct net *net)
2434{
b099ce26
EB
2435}
2436
2437static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2438{
2439 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
046ee902
DL
2440}
2441
2442static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
2443 .init = tcp_sk_init,
2444 .exit = tcp_sk_exit,
2445 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
2446};
2447
9b0f976f 2448void __init tcp_v4_init(void)
1da177e4 2449{
5caea4ea 2450 inet_hashinfo_init(&tcp_hashinfo);
6a1b3054 2451 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 2452 panic("Failed to create the TCP control socket.\n");
1da177e4 2453}