// SPDX-License-Identifier: GPL-2.0-or-later
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *		Implementation of the Transmission Control Protocol(TCP).
 *		IPv4 specific functions
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *		See tcp.c for author information
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash,
 *				half is devoted to TIME_WAIT sockets
 *				and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: ip options weren't passed to
 *				the TCP layer, missed a check for an
 *	Andi Kleen :		Implemented fast path mtu discovery.
 *				Fixed many serious bugs in the
 *				request_sock handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan :		Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov :	Transparent proxy revived after year
 *	Andi Kleen :		Fix new listen.
 *	Andi Kleen :		Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
#define pr_fmt(fmt) "TCP: " fmt
#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/inet_hashtables.h>
#include <net/transp_v6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>
#include <crypto/hash.h>
#include <linux/scatterlist.h>
#include <trace/events/tcp.h>
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      tcp_hdr(skb)->source);
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	/* Still does not detect *everything* that goes through
	 * lo, since we require a loopback src or dst address
	 * or direct binding to 'lo' interface.
	bool loopback = false;
	if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
#if IS_ENABLED(CONFIG_IPV6)
	if (tw->tw_family == AF_INET6) {
		if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
		    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
		    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
		    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
	if (ipv4_is_loopback(tw->tw_daddr) ||
	    ipv4_is_loopback(tw->tw_rcv_saddr))
	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
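
/* A worked example of the sequence bump above (illustrative note, not part
 * of the original source): tw_snd_nxt + 65535 + 2 moves the new
 * connection's initial write_seq past anything the previous incarnation
 * could still have in flight, since 65535 is the largest unscaled receive
 * window. E.g. with tw_snd_nxt == 1000 the new write_seq becomes
 * 1000 + 65535 + 2 = 66537; the extra 2 plausibly leaves headroom for the
 * SYN and FIN, which each consume one unit of sequence space.
 */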
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent the BPF program called below from accessing bytes that are
	 * outside the bound specified by the user in addr_len.
	if (addr_len < sizeof(struct sockaddr_in))
	sock_owned_by_me(sk);
	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
	if (addr_len < sizeof(struct sockaddr_in))
	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;
	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		nexthop = inet_opt->opt.faddr;
	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
	if (!inet_opt || !inet_opt->opt.srr)
	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);
	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);
	inet_csk(sk)->icsk_ext_hdr_len = 0;
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	if (likely(!tp->repair)) {
		WRITE_ONCE(tp->write_seq,
			   secure_tcp_seq(inet->inet_saddr,
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
	inet->inet_id = prandom_u32();
	if (tcp_fastopen_defer_connect(sk, &err))
	err = tcp_connect(sk);
	 * This unhashes the socket and releases the local port,
	tcp_set_state(sk, TCP_CLOSE);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
EXPORT_SYMBOL(tcp_v4_connect);
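
/* For orientation (an illustrative sketch, not part of the original file):
 * tcp_v4_connect() is what ultimately runs when userspace connect()s an
 * AF_INET/SOCK_STREAM socket, e.g.:
 *
 *	#include <arpa/inet.h>
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),	// hypothetical peer port
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * The addr_len and sin_family checks at the top of tcp_v4_connect() are
 * exactly what such a call must satisfy before a route is looked up and
 * the SYN is sent.
 */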
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if the socket was owned by the user
 * at the time tcp_v4_err() was called to handle the ICMP message.
void tcp_v4_mtu_reduced(struct sock *sk)
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	/* Something is about to go wrong... Remember the soft error
	 * in case this connection will not be able to recover.
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;
	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);
		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
static void do_redirect(struct sk_buff *skb, struct sock *sk)
	struct dst_entry *dst = __sk_dst_check(sk, 0);
		dst->ops->redirect(dst, sk, skb);
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);
	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
EXPORT_SYMBOL(tcp_req_err);
/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	if (sock_owned_by_user(sk))
	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		tcp_retransmit_timer(sk);
EXPORT_SYMBOL(tcp_ld_RTO_revert);
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
int tcp_v4_err(struct sk_buff *skb, u32 info)
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct net *net = dev_net(skb->dev);
	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	if (sk->sk_state == TCP_CLOSE)
	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
	case ICMP_PARAMETERPROB:
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
			 * they should go through unfragmented).
			if (sk->sk_state == TCP_LISTEN)
			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
	case ICMP_TIME_EXCEEDED:
	switch (sk->sk_state) {
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		if (fastopen && !fastopen->sk)
		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
		if (!sock_owned_by_user(sk)) {
			sk->sk_err_soft = err;
		/* If we've already connected we will keep trying
		 * until we time out, or the user gives up.
		 * RFC 1122 4.2.3.9 allows us to consider as hard errors
		 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
		 * but it is obsoleted by PMTU discovery).
		 * Note that on the modern internet, where routing is unreliable
		 * and broken firewalls sit in every dark corner sending random
		 * errors ordered by their masters, even these two messages finally lose
		 * their original sense (even Linux sends invalid PORT_UNREACHs)
		 * Now we are in compliance with RFCs.
	if (!sock_owned_by_user(sk) && inet->recverr) {
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
	struct tcphdr *th = tcp_hdr(skb);
	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
	const struct inet_sock *inet = inet_sk(sk);
	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
EXPORT_SYMBOL(tcp_v4_send_check);
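
/* A short aside (illustrative, not from the original source): the partial
 * checksum seeded above via tcp_v4_check(..., 0) covers the classic IPv4
 * pseudo-header of RFC 793:
 *
 *	+--------+--------+--------+--------+
 *	|          source address           |
 *	+--------+--------+--------+--------+
 *	|        destination address        |
 *	+--------+--------+--------+--------+
 *	|  zero  | proto  |    TCP length   |
 *	+--------+--------+--------+--------+
 *
 * csum_start/csum_offset then tell CHECKSUM_PARTIAL-capable hardware (or
 * the software fallback) where to fold the final 16-bit checksum into the
 * TCP header.
 */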
 * This routine will send an RST to the other tcp.
 * Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
 * Answer: if a packet caused an RST, it is not for a socket
 *	   existing in our system; if it is matched to a socket,
 *	   it is just a duplicate segment or a bug in the other side's TCP.
 *	   So we build the reply based only on the parameters
 *	   that arrived with the segment.
 * Exception: precedence violation. We do not implement it in any case.
#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#define OPTION_BYTES sizeof(__be32)
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
	const struct tcphdr *th = tcp_hdr(skb);
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
	u64 transmit_time = 0;
	/* Never send a reset in response to a reset. */
	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
		rep.th.seq = th->ack_seq;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * We do not lose security here:
		 * the incoming packet is checked with the md5 hash of the key
		 * we find, and no RST is generated if the md5 hash doesn't match.
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_MD5SIG << 8) |
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;
		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here: if we choose to force
	 * the input interface, we will misroute in case of an asymmetric route.
		arg.bound_dev_if = sk->sk_bound_dev_if;
	trace_tcp_send_reset(sk, skb);
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
#ifdef CONFIG_TCP_MD5SIG
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
	const struct tcphdr *th = tcp_hdr(skb);
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.window  = htons(win);
#ifdef CONFIG_TCP_MD5SIG
		int offset = (tsecr) ? 3 : 0;
		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_MD5SIG << 8) |
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;
		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.bound_dev_if = oif;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
	const union tcp_md5_addr *addr;
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
	const struct inet_request_sock *ireq = inet_rsk(req);
	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    rcu_dereference(ireq->ireq_opt),
		err = net_xmit_eval(err);
 * IPv4 request_sock destructor.
static void tcp_v4_reqsk_destructor(struct request_sock *req)
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
#ifdef CONFIG_TCP_MD5SIG
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
	if (old->l3index == 0 && new->l3index)
	return old->prefixlen < new->prefixlen;
/* Find the Key structure for an address. */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	struct tcp_md5sig_key *best_match = NULL;
	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
		if (match && better_md5_match(best_match, key))
EXPORT_SYMBOL(__tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;
	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
		if (key->l3index != l3index)
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
	const union tcp_md5_addr *addr;
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;
	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care about
		 * key mismatches, since changing the MD5 key on live flows
		 * can lead to packet drops.
		data_race(memcpy(key->key, newkey, newkeylen));
		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch the new key->keylen value
		 * but the old key->key[]; this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		WRITE_ONCE(key->keylen, newkeylen);
	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
	struct tcp_md5sig_key *key;
	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
EXPORT_SYMBOL(tcp_md5_do_del);
static void tcp_clear_md5_list(struct sock *sk)
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;
	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	if (optlen < sizeof(cmd))
	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
	if (sin->sin_family != AF_INET)
	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;
		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		if (!dev || !l3index)
	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
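
/* Userspace view (an illustrative sketch, not part of the original file):
 * the parser above services setsockopt(TCP_MD5SIG/TCP_MD5SIG_EXT), e.g.:
 *
 *	#include <arpa/inet.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };	// hypothetical key
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key (tcp_md5_do_del() above); anything up
 * to TCP_MD5SIG_MAXKEYLEN adds or replaces one.
 */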
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);
	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	hp = tcp_get_md5sig_pool();
		goto clear_hash_noput;
	if (crypto_ahash_init(req))
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
	if (tcp_md5_hash_key(hp, key))
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
	tcp_put_md5sig_pool();
	tcp_put_md5sig_pool();
	memset(md5_hash, 0, 16);
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;
	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
		const struct iphdr *iph = ip_hdr(skb);
	hp = tcp_get_md5sig_pool();
		goto clear_hash_noput;
	if (crypto_ahash_init(req))
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
	if (tcp_md5_hash_key(hp, key))
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
	tcp_put_md5sig_pool();
	tcp_put_md5sig_pool();
	memset(md5_hash, 0, 16);
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);
	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req)
	tcp_v4_init_req(req, sk, skb);
	if (security_inet_conn_request(sk, skb, req))
	return inet_csk_route_req(sk, &fl->u.ip4, req);
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
	/* Never answer SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);
EXPORT_SYMBOL(tcp_v4_conn_request);
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	struct ip_options_rcu *inet_opt;
	if (sk_acceptq_is_full(sk))
	newsk = tcp_create_openreq_child(sk, req, skb);
	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();
	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
		dst = inet_csk_route_child_sock(sk, newsk, req);
	/* syncookie case : see end of cookie_v4_check() */
	sk_setup_caps(newsk, dst);
	tcp_ca_openreq_child(newsk, dst);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
	tcp_initialize_rcv_mss(newsk);
#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_gso_disable(newsk);
	if (__inet_inherit_port(sk, newsk) < 0)
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
		newinet->inet_opt = NULL;
		if (!req_unhash && found_dup_sk) {
			/* This code path should be executed only in
			 * the syncookie case
			bh_unlock_sock(newsk);
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);
		sk = cookie_v4_check(sk, skb);
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
	enum skb_drop_reason reason;
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;
		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));
		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
		tcp_rcv_established(sk, skb);
	reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (tcp_checksum_complete(skb))
	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
	sock_rps_save_rxhash(sk, skb);
	if (tcp_rcv_state_process(sk, skb)) {
	tcp_v4_send_reset(rsk, skb);
	kfree_skb_reason(skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_early_demux(struct sk_buff *skb)
	const struct iphdr *iph;
	const struct tcphdr *th;
	if (skb->pkt_type != PACKET_HOST)
	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
	if (th->doff < sizeof(struct tcphdr) / 4)
	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
				dst = dst_check(dst, 0);
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
	u32 limit, tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	if (unlikely(tcp_checksum_complete(skb))) {
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	/* Attempt coalescing to last skb in backlog, even if we are
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;
	tail = sk->sk_backlog.tail;
	thtail = (struct tcphdr *)tail->data;
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	       TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
	__skb_pull(skb, hdrlen);
	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;
	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;
	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *	 at least for 'GRO' packets.
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
	__skb_push(skb, hdrlen);
	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few socket backlogs are likely to be non-empty concurrently.
	limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
EXPORT_SYMBOL(tcp_add_backlog);
int tcp_filter(struct sock *sk, struct sk_buff *skb)
	struct tcphdr *th = (struct tcphdr *)skb->data;
	return sk_filter_trim_cap(sk, skb, th->doff * 4);
EXPORT_SYMBOL(tcp_filter);
static void tcp_v4_restore_cb(struct sk_buff *skb)
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
	/* This is tricky: we move IPCB to its correct location into TCP_SKB_CB().
	 * barrier() makes sure the compiler won't play fool^Waliasing games.
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
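
/* Worked example for the end_seq arithmetic above (illustrative, not from
 * the original source): end_seq counts every unit that consumes sequence
 * space. For a segment with seq = 1000, 500 payload bytes (skb->len minus
 * th->doff * 4), FIN set and SYN clear:
 *
 *	end_seq = 1000 + 0 (syn) + 1 (fin) + 500 = 1501
 *
 * so a pure ACK (no payload, no SYN/FIN) has end_seq == seq.
 */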
int tcp_v4_rcv(struct sk_buff *skb)
	struct net *net = dev_net(skb->dev);
	enum skb_drop_reason drop_reason;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
	th = (const struct tcphdr *)skb->data;
	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
	if (!pskb_may_pull(skb, th->doff * 4))
	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
	th = (const struct tcphdr *)skb->data;
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (sk->sk_state == TCP_TIME_WAIT)
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
			drop_reason = tcp_inbound_md5_hash(sk, skb,
						   &iph->saddr, &iph->daddr,
						   AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_add(sk, skb);
		if (tcp_checksum_complete(skb)) {
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
				inet_csk_reqsk_queue_drop_and_put(sk, req);
			/* reuseport_migrate_sock() has already held one sk_refcnt
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
			/* Another cpu got exclusive access to req
			 * and created a full blown socket.
			 * Try to feed this packet to this socket
			 * instead of discarding it.
			tcp_v4_restore_cb(skb);
			goto discard_and_relse;
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto discard_and_relse;
	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
					   &iph->daddr, AF_INET, dif, sdif);
		goto discard_and_relse;
	if (tcp_filter(sk, skb)) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	tcp_v4_fill_cb(skb, iph, th);
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	sk_incoming_cpu_update(sk);
	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
		if (tcp_add_backlog(sk, skb, &drop_reason))
			goto discard_and_relse;
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
	tcp_v4_fill_cb(skb, iph, th);
	if (tcp_checksum_complete(skb)) {
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
		tcp_v4_send_reset(NULL, skb);
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard frame. */
	kfree_skb_reason(skb, drop_reason);
	sk_drops_add(sk, skb);
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
	tcp_v4_fill_cb(skb, iph, th);
	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							iph->saddr, th->source,
							iph->daddr, th->dest,
			inet_twsk_deschedule_put(inet_twsk(sk));
			tcp_v4_restore_cb(skb);
		tcp_v4_timewait_ack(sk, skb);
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
	case TCP_TW_SUCCESS:;
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	 = sizeof(struct tcp_timewait_sock),
	.twsk_unique	 = tcp_twsk_unique,
	.twsk_destructor = tcp_twsk_destructor,
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
	struct dst_entry *dst = skb_dst(skb);
	if (dst && dst_hold_safe(dst)) {
		rcu_assign_pointer(sk->sk_rx_dst, dst);
		sk->sk_rx_dst_ifindex = skb->skb_iif;
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.mtu_reduced	   = tcp_v4_mtu_reduced,
EXPORT_SYMBOL(ipv4_specific);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
static int tcp_v4_init_sock(struct sock *sk)
	struct inet_connection_sock *icsk = inet_csk(sk);
	icsk->icsk_af_ops = &ipv4_specific;
#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
void tcp_v4_destroy_sock(struct sock *sk)
	struct tcp_sock *tp = tcp_sk(sk);
	trace_tcp_destroy_sock(sk);
	tcp_clear_xmit_timers(sk);
	tcp_cleanup_congestion_control(sk);
	tcp_cleanup_ulp(sk);
	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);
	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);
	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);
#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
	/* If the socket is aborted during a connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);
	sk_sockets_allocated_dec(sk);
EXPORT_SYMBOL(tcp_v4_destroy_sock);
2269 #ifdef CONFIG_PROC_FS
2270 /* Proc filesystem TCP sock list dumping. */
2272 static unsigned short seq_file_family(const struct seq_file *seq);
2274 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2276 unsigned short family = seq_file_family(seq);
2278 /* AF_UNSPEC is used as a match all */
2279 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2280 net_eq(sock_net(sk), seq_file_net(seq)));
2283 /* Find a non empty bucket (starting from st->bucket)
2284 * and return the first sk from it.
2286 static void *listening_get_first(struct seq_file *seq)
2288 struct tcp_iter_state *st = seq->private;
2291 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2292 struct inet_listen_hashbucket *ilb2;
2293 struct hlist_nulls_node *node;
2296 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2297 if (hlist_nulls_empty(&ilb2->nulls_head))
2300 spin_lock(&ilb2->lock);
2301 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2302 if (seq_sk_match(seq, sk))
2305 spin_unlock(&ilb2->lock);
2311 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2312 * If "cur" is the last one in the st->bucket,
2313 * call listening_get_first() to return the first sk of the next
2316 static void *listening_get_next(struct seq_file *seq, void *cur)
2318 struct tcp_iter_state *st = seq->private;
2319 struct inet_listen_hashbucket *ilb2;
2320 struct hlist_nulls_node *node;
2321 struct sock *sk = cur;
2326 sk = sk_nulls_next(sk);
2327 sk_nulls_for_each_from(sk, node) {
2328 if (seq_sk_match(seq, sk))
2332 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2333 spin_unlock(&ilb2->lock);
2335 return listening_get_first(seq);
2338 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2340 struct tcp_iter_state *st = seq->private;
2345 rc = listening_get_first(seq);
2347 while (rc && *pos) {
2348 rc = listening_get_next(seq, rc);
2354 static inline bool empty_bucket(const struct tcp_iter_state *st)
2356 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;
		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock_bh(lock);
	}
	return NULL;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}
	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);
	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);
	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}
	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > tcp_hashinfo.lhash2_mask)
			break;
		rc = listening_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;
	return rc;
}

void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);

void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

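/* For reference, one illustrative line produced by the format above
 * (values invented for the example):
 *
 *	0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000 0 12345 1 0000000000000000 100 0 0 10 0
 *
 * That is a socket bound to 127.0.0.1:3306 (addresses printed via %08X
 * in host byte order, ports in hex) in state 0x0A (TCP_LISTEN), owned
 * by uid 1000, inode 12345; the trailing fields are the refcount, the
 * (hashed, %pK) socket pointer, rto, ato, the quick/pingpong bits, the
 * send congestion window, and the listen-queue/ssthresh field.
 */
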
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

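/* A minimal userspace consumer of this format might look like the
 * following sketch (illustrative only; error handling omitted; the
 * first fgets() merely skips the header row):
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned int laddr, lport, raddr, rport, state;
 *		char line[512];
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		fgets(line, sizeof(line), f);
 *		while (fgets(line, sizeof(line), f)) {
 *			if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
 *				   &laddr, &lport, &raddr, &rport,
 *				   &state) == 5)
 *				printf("%08X:%u -> %08X:%u st %02X\n",
 *				       laddr, lport, raddr, rport, state);
 *		}
 *		return 0;
 *	}
 */
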
#ifdef CONFIG_BPF_SYSCALL
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};

struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};

static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

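/* On the consumer side, a bpf_iter program receives this context.  A
 * hedged sketch of such a program (libbpf conventions; the program name
 * is hypothetical), modelled on the in-tree selftests:
 *
 *	SEC("iter/tcp")
 *	int dump_tcp4(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "%pI4:%u uid %u\n",
 *			       &skc->skc_rcv_saddr, skc->skc_num,
 *			       ctx->uid);
 *		return 0;
 *	}
 *
 * Once loaded and attached, the resulting link can be pinned in bpffs
 * and read like an ordinary file.
 */
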
static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}

static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
				      unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_tcp_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;
	return 0;
}

static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
	return expected;
}

static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	return expected;
}

static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The st->bucket is done.  Directly advance to the next
	 * bucket instead of letting tcp_seek_last_pos() skip
	 * sockets one by one in the current bucket only to discover
	 * it has to advance to the next bucket.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > tcp_hashinfo.lhash2_mask) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}
again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}
	return sk;
}

static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
	if (*pos)
		return bpf_iter_tcp_batch(seq);

	return SEQ_START_TOKEN;
}

static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		/* Move st->offset to the next sk in the bucket such that
		 * the future start() will resume at st->offset in
		 * st->bucket.  See tcp_seek_last_pos().
		 */
		st->offset++;
		sock_put(iter->batch[iter->cur_sk++]);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}

static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	bool slow;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		unlock_sock_fast(sk, slow);
	return ret;
}

static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk) {
		bpf_iter_tcp_put_batch(iter);
		iter->st_bucket_done = false;
	}
}

static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
#endif

static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter.  Let the bpf prog do the filtering instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}

static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

/* @wake is one when sk_stream_write_space() calls us.
 * This sends EPOLLOUT only once notsent_bytes drops below half the limit.
 * This mimics the strategy used in sock_def_write_space().
 */
bool tcp_stream_memory_free(const struct sock *sk, int wake)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
			    READ_ONCE(tp->snd_nxt);

	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
}
EXPORT_SYMBOL(tcp_stream_memory_free);

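/* Worked example (numbers invented): with tcp_notsent_lowat at 131072,
 * the plain check (wake == 0) passes while notsent_bytes < 131072, but
 * the wakeup path (wake == 1) evaluates (notsent_bytes << 1) < 131072,
 * i.e. notsent_bytes < 65536, so EPOLLOUT fires only once the unsent
 * backlog has drained below half the limit.
 */
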
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

static void __net_exit tcp_sk_exit(struct net *net)
{
	struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);
	if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
		kfree(tcp_death_row);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int cnt;

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
	if (!net->ipv4.tcp_death_row)
		return -ENOMEM;
	refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
}

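/* The per-netns default selected above can be overridden per socket
 * with the standard TCP_CONGESTION socket option.  Illustrative
 * userspace sketch (Reno is always available, per the comment above):
 *
 *	char cc[] = "reno";
 *
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cc, strlen(cc)) < 0)
 *		perror("TCP_CONGESTION");
 */
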
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

#define INIT_BATCH_SZ 16

static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}
	return 0;
}

static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}

static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};

static const struct bpf_func_proto *
bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
			    const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}

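/* Exposing bpf_sk_setsockopt()/bpf_sk_getsockopt() means an iterator
 * program can rewrite options on every socket it visits.  A hedged
 * sketch, loosely modelled on the in-tree selftests (the program name
 * is hypothetical):
 *
 *	SEC("iter/tcp")
 *	int set_cc(struct bpf_iter__tcp *ctx)
 *	{
 *		char cc[] = "cubic";
 *		struct tcp_sock *tp;
 *
 *		tp = bpf_skc_to_tcp_sock(ctx->sk_common);
 *		if (!tp)
 *			return 0;
 *		bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
 *		return 0;
 *	}
 */
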
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};

static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}
#endif

void __init tcp_v4_init(void)
{
	int cpu, res;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, &init_net);
		if (res)
			panic("Failed to create the TCP control socket.\n");
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		per_cpu(ipv4_tcp_sk, cpu) = sk;
	}
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}