/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>
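/* Module-wide tunables: tcp_tw_reuse lets connect() reuse a port pair that
 * is still in TIME-WAIT when it is provably safe (see tcp_twsk_unique()
 * below); tcp_low_latency makes the receive path bypass the prequeue (see
 * tcp_prequeue() below).
 */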
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}
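/* Called from tcp_v4_connect(): decide whether an existing TIME-WAIT bucket
 * may be reused for a new outgoing connection with the same 4-tuple.
 */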
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
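		/* Start the new connection's sequence space past the old
		 * one's: 65535 covers the largest window the peer could have
		 * advertised (assuming, as a note here, no window scaling on
		 * the old flow), and +2 keeps it strictly greater.
		 */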
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
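	/* With IP strict source routing, the first hop (faddr), not the
	 * final destination, is what we must route to.
	 */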
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	inet_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}

	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
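	/* Choose an initial sequence number unless one was already set,
	 * e.g. by TCP repair mode.
	 */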
	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC 1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);
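	/* Hand the ICMP redirect to the routing layer so it can update
	 * the cached next hop for this destination.
	 */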
	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}
		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && fastopen->sk == NULL)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only based on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;
	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net,
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->daddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
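	/* csumoffset is expressed in 16-bit words, as ip_reply_arg expects. */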
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);
#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      u16 queue_mapping,
			      struct tcp_fastopen_cookie *foc)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}
/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;
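		/* MD5 signing must see the exact segments that go on the
		 * wire, so segmentation offload is disabled for this socket.
		 */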
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
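	/* Publish the fully initialised key; RCU readers traversing the
	 * list will see either the old list or the complete new entry.
	 */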
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->ir_loc_addr;
		daddr = inet_rsk(req)->ir_rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
				      const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
}

static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	bool ret;

	rcu_read_lock();
	ret = __tcp_v4_inbound_md5_hash(sk, skb);
	rcu_read_unlock();

	return ret;
}
#endif
static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
	ireq->opt = tcp_v4_save_options(skb);
	ireq->ireq_family = AF_INET;
}

static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
	.queue_hash_add =	inet_csk_reqsk_queue_hash_add,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	inet_set_txhash(newsk);
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;
	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct request_sock *req;
	struct sock *nsk;

	req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
	if (req) {
		nsk = tcp_check_req(sk, skb, req, false);
		if (!nsk)
			reqsk_put(req);
		return nsk;
	}

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    dst->ops->check(dst, 0) == NULL) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);

		if (!nsk)
			goto discard;

		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			sk_mark_napi_id(sk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = sk->sk_rx_dst;

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}
/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;
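	/* Header-only segments (e.g. pure ACKs) are cheap: process them
	 * directly unless the prequeue already holds segments.
	 */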
	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
	 * Instead of doing full sk_rx_dst validity here, let's perform
	 * an optimistic check.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (tp->ucopy.memory > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
			sk_backlog_rcv(sk, skb1);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPPREQUEUEDROPPED);
		}

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);
/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;
process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * while:
	 * o We're expecting an MD5'd packet and this is no MD5 tcp option
	 * o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;
#endif

	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	sk_incoming_cpu_update(sk);
	skb->dev = NULL;
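	/* Serialise against the process-context owner: process in softirq
	 * when the socket is free, otherwise prequeue or backlog the skb.
	 */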
	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (unlikely(sk_add_backlog(sk, skb,
					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
csum_error:
		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2)) {
		inet_twsk_put(inet_twsk(sk));
		goto bad_packet;
	}
	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst) {
		dst_hold(dst);
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk != NULL);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);

	sk_sockets_allocated_dec(sk);
	sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket follow cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i, kuid_t uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f), uid),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		sk->sk_state == TCP_LISTEN ?
		    (fastopenq ? fastopenq->max_qlen : 0) :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	__be32 dest, src;
	__u16 destp, srcp;
	s32 delta = tw->tw_ttd - inet_tw_time_stamp();

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (sk->sk_state == TCP_TIME_WAIT)
			get_timewait4_sock(v, seq, st->num);
		else
			get_tcp4_sock(v, seq, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(v, seq, st->num, st->uid);
		break;
	}
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
	.init_cgroup		= tcp_init_cgroup,
	.destroy_cgroup		= tcp_destroy_cgroup,
	.proto_cgroup		= tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}
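	/* sysctl_tcp_ecn == 2: accept and use ECN when the peer requests it,
	 * but do not request it on outgoing connections.
	 */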
	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	return 0;

fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}