net/ipv4/tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/inetdevice.h>
80 #include <linux/btf_ids.h>
81
82 #include <crypto/hash.h>
83 #include <linux/scatterlist.h>
84
85 #include <trace/events/tcp.h>
86
87 #ifdef CONFIG_TCP_MD5SIG
88 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
89                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
90 #endif
91
92 struct inet_hashinfo tcp_hashinfo;
93 EXPORT_SYMBOL(tcp_hashinfo);
94
95 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96
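/* Pick the initial sequence number to use when answering the SYN in @skb,
 * derived from its address/port 4-tuple via secure_tcp_seq().
 */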
97 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 {
99         return secure_tcp_seq(ip_hdr(skb)->daddr,
100                               ip_hdr(skb)->saddr,
101                               tcp_hdr(skb)->dest,
102                               tcp_hdr(skb)->source);
103 }
104
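/* Per-destination timestamp offset, so that TCP timestamp values are not
 * directly comparable across different peers.
 */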
105 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 {
107         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
108 }
109
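/* Decide whether a connecting socket may reuse the four-tuple currently
 * held by the TIME-WAIT socket @sktw; returns 1 if reuse is considered safe.
 */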
110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 {
112         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
113         const struct inet_timewait_sock *tw = inet_twsk(sktw);
114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115         struct tcp_sock *tp = tcp_sk(sk);
116
117         if (reuse == 2) {
118                 /* Still does not detect *everything* that goes through
119                  * lo, since we require a loopback src or dst address
120                  * or direct binding to 'lo' interface.
121                  */
122                 bool loopback = false;
123                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124                         loopback = true;
125 #if IS_ENABLED(CONFIG_IPV6)
126                 if (tw->tw_family == AF_INET6) {
127                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
128                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
129                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
130                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
131                                 loopback = true;
132                 } else
133 #endif
134                 {
135                         if (ipv4_is_loopback(tw->tw_daddr) ||
136                             ipv4_is_loopback(tw->tw_rcv_saddr))
137                                 loopback = true;
138                 }
139                 if (!loopback)
140                         reuse = 0;
141         }
142
 143         /* With PAWS, it is safe from the viewpoint
 144            of data integrity. Even without PAWS it is safe provided sequence
 145            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 146
 147            Actually, the idea is close to VJ's, only the timestamp cache is
 148            held not per host but per port pair, and the TW bucket is used as
 149            the state holder.
 150
 151            If the TW bucket has already been destroyed we fall back to VJ's
 152            scheme and use the initial timestamp retrieved from the peer table.
 153          */
154         if (tcptw->tw_ts_recent_stamp &&
155             (!twp || (reuse && time_after32(ktime_get_seconds(),
156                                             tcptw->tw_ts_recent_stamp)))) {
157                 /* In case of repair and re-using TIME-WAIT sockets we still
158                  * want to be sure that it is safe as above but honor the
159                  * sequence numbers and time stamps set as part of the repair
160                  * process.
161                  *
162                  * Without this check re-using a TIME-WAIT socket with TCP
163                  * repair would accumulate a -1 on the repair assigned
164                  * sequence number. The first time it is reused the sequence
165                  * is -1, the second time -2, etc. This fixes that issue
166                  * without appearing to create any others.
167                  */
168                 if (likely(!tp->repair)) {
169                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
170
171                         if (!seq)
172                                 seq = 1;
173                         WRITE_ONCE(tp->write_seq, seq);
174                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
175                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
176                 }
177                 sock_hold(sktw);
178                 return 1;
179         }
180
181         return 0;
182 }
183 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
184
185 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
186                               int addr_len)
187 {
 188         /* This check is replicated from tcp_v4_connect() and intended to
 189          * prevent the BPF program called below from accessing bytes that are
 190          * out of the bounds specified by the user in addr_len.
 191          */
192         if (addr_len < sizeof(struct sockaddr_in))
193                 return -EINVAL;
194
195         sock_owned_by_me(sk);
196
197         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
198 }
199
200 /* This will initiate an outgoing connection. */
201 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 {
203         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204         struct inet_timewait_death_row *tcp_death_row;
205         struct inet_sock *inet = inet_sk(sk);
206         struct tcp_sock *tp = tcp_sk(sk);
207         struct ip_options_rcu *inet_opt;
208         struct net *net = sock_net(sk);
209         __be16 orig_sport, orig_dport;
210         __be32 daddr, nexthop;
211         struct flowi4 *fl4;
212         struct rtable *rt;
213         int err;
214
215         if (addr_len < sizeof(struct sockaddr_in))
216                 return -EINVAL;
217
218         if (usin->sin_family != AF_INET)
219                 return -EAFNOSUPPORT;
220
221         nexthop = daddr = usin->sin_addr.s_addr;
222         inet_opt = rcu_dereference_protected(inet->inet_opt,
223                                              lockdep_sock_is_held(sk));
224         if (inet_opt && inet_opt->opt.srr) {
225                 if (!daddr)
226                         return -EINVAL;
227                 nexthop = inet_opt->opt.faddr;
228         }
229
230         orig_sport = inet->inet_sport;
231         orig_dport = usin->sin_port;
232         fl4 = &inet->cork.fl.u.ip4;
233         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
235                               orig_dport, sk);
236         if (IS_ERR(rt)) {
237                 err = PTR_ERR(rt);
238                 if (err == -ENETUNREACH)
239                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
240                 return err;
241         }
242
243         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
244                 ip_rt_put(rt);
245                 return -ENETUNREACH;
246         }
247
248         if (!inet_opt || !inet_opt->opt.srr)
249                 daddr = fl4->daddr;
250
251         tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
252
253         if (!inet->inet_saddr) {
254                 err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
255                 if (err) {
256                         ip_rt_put(rt);
257                         return err;
258                 }
259         } else {
260                 sk_rcv_saddr_set(sk, inet->inet_saddr);
261         }
262
263         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
264                 /* Reset inherited state */
265                 tp->rx_opt.ts_recent       = 0;
266                 tp->rx_opt.ts_recent_stamp = 0;
267                 if (likely(!tp->repair))
268                         WRITE_ONCE(tp->write_seq, 0);
269         }
270
271         inet->inet_dport = usin->sin_port;
272         sk_daddr_set(sk, daddr);
273
274         inet_csk(sk)->icsk_ext_hdr_len = 0;
275         if (inet_opt)
276                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
277
278         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
279
 280         /* Socket identity is still unknown (sport may be zero).
 281          * However we set state to SYN-SENT and, without releasing the socket
 282          * lock, select a source port, enter ourselves into the hash tables
 283          * and complete initialization after this.
 284          */
285         tcp_set_state(sk, TCP_SYN_SENT);
286         err = inet_hash_connect(tcp_death_row, sk);
287         if (err)
288                 goto failure;
289
290         sk_set_txhash(sk);
291
292         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
293                                inet->inet_sport, inet->inet_dport, sk);
294         if (IS_ERR(rt)) {
295                 err = PTR_ERR(rt);
296                 rt = NULL;
297                 goto failure;
298         }
299         tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
300         /* OK, now commit destination to socket.  */
301         sk->sk_gso_type = SKB_GSO_TCPV4;
302         sk_setup_caps(sk, &rt->dst);
303         rt = NULL;
304
305         if (likely(!tp->repair)) {
306                 if (!tp->write_seq)
307                         WRITE_ONCE(tp->write_seq,
308                                    secure_tcp_seq(inet->inet_saddr,
309                                                   inet->inet_daddr,
310                                                   inet->inet_sport,
311                                                   usin->sin_port));
312                 WRITE_ONCE(tp->tsoffset,
313                            secure_tcp_ts_off(net, inet->inet_saddr,
314                                              inet->inet_daddr));
315         }
316
317         atomic_set(&inet->inet_id, get_random_u16());
318
319         if (tcp_fastopen_defer_connect(sk, &err))
320                 return err;
321         if (err)
322                 goto failure;
323
324         err = tcp_connect(sk);
325
326         if (err)
327                 goto failure;
328
329         return 0;
330
331 failure:
332         /*
333          * This unhashes the socket and releases the local port,
334          * if necessary.
335          */
336         tcp_set_state(sk, TCP_CLOSE);
337         inet_bhash2_reset_saddr(sk);
338         ip_rt_put(rt);
339         sk->sk_route_caps = 0;
340         inet->inet_dport = 0;
341         return err;
342 }
343 EXPORT_SYMBOL(tcp_v4_connect);
344
 345 /*
 346  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC 1191.
 347  * It can be called through tcp_release_cb() if the socket was owned by the user
 348  * at the time tcp_v4_err() was called to handle the ICMP message.
 349  */
350 void tcp_v4_mtu_reduced(struct sock *sk)
351 {
352         struct inet_sock *inet = inet_sk(sk);
353         struct dst_entry *dst;
354         u32 mtu;
355
356         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
357                 return;
358         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
359         dst = inet_csk_update_pmtu(sk, mtu);
360         if (!dst)
361                 return;
362
 363         /* Something is about to go wrong... Remember the soft error
 364          * in case this connection is not able to recover.
 365          */
366         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
367                 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
368
369         mtu = dst_mtu(dst);
370
371         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
372             ip_sk_accept_pmtu(sk) &&
373             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
374                 tcp_sync_mss(sk, mtu);
375
376                 /* Resend the TCP packet because it's
377                  * clear that the old packet has been
378                  * dropped. This is the new "fast" path mtu
379                  * discovery.
380                  */
381                 tcp_simple_retransmit(sk);
382         } /* else let the usual retransmit timer handle it */
383 }
384 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
385
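/* Update the socket's cached route in response to an ICMP redirect, if any
 * destination entry is still attached.
 */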
386 static void do_redirect(struct sk_buff *skb, struct sock *sk)
387 {
388         struct dst_entry *dst = __sk_dst_check(sk, 0);
389
390         if (dst)
391                 dst->ops->redirect(dst, sk, skb);
392 }
393
394
395 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
396 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
397 {
398         struct request_sock *req = inet_reqsk(sk);
399         struct net *net = sock_net(sk);
400
401         /* ICMPs are not backlogged, hence we cannot get
402          * an established socket here.
403          */
404         if (seq != tcp_rsk(req)->snt_isn) {
405                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
406         } else if (abort) {
407                 /*
408                  * Still in SYN_RECV, just remove it silently.
409                  * There is no good way to pass the error to the newly
410                  * created socket, and POSIX does not want network
411                  * errors returned from accept().
412                  */
413                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
414                 tcp_listendrop(req->rsk_listener);
415         }
416         reqsk_put(req);
417 }
418 EXPORT_SYMBOL(tcp_req_err);
419
420 /* TCP-LD (RFC 6069) logic */
421 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
422 {
423         struct inet_connection_sock *icsk = inet_csk(sk);
424         struct tcp_sock *tp = tcp_sk(sk);
425         struct sk_buff *skb;
426         s32 remaining;
427         u32 delta_us;
428
429         if (sock_owned_by_user(sk))
430                 return;
431
432         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
433             !icsk->icsk_backoff)
434                 return;
435
436         skb = tcp_rtx_queue_head(sk);
437         if (WARN_ON_ONCE(!skb))
438                 return;
439
440         icsk->icsk_backoff--;
441         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
442         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
443
444         tcp_mstamp_refresh(tp);
445         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
446         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
447
448         if (remaining > 0) {
449                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
450                                           remaining, TCP_RTO_MAX);
451         } else {
452                 /* RTO revert clocked out retransmission.
453                  * Will retransmit now.
454                  */
455                 tcp_retransmit_timer(sk);
456         }
457 }
458 EXPORT_SYMBOL(tcp_ld_RTO_revert);
459
460 /*
461  * This routine is called by the ICMP module when it gets some
462  * sort of error condition.  If err < 0 then the socket should
463  * be closed and the error returned to the user.  If err > 0
464  * it's just the icmp type << 8 | icmp code.  After adjustment
465  * header points to the first 8 bytes of the tcp header.  We need
466  * to find the appropriate port.
467  *
468  * The locking strategy used here is very "optimistic". When
469  * someone else accesses the socket the ICMP is just dropped
470  * and for some paths there is no check at all.
471  * A more general error queue to queue errors for later handling
472  * is probably better.
473  *
474  */
475
476 int tcp_v4_err(struct sk_buff *skb, u32 info)
477 {
478         const struct iphdr *iph = (const struct iphdr *)skb->data;
479         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
480         struct tcp_sock *tp;
481         const int type = icmp_hdr(skb)->type;
482         const int code = icmp_hdr(skb)->code;
483         struct sock *sk;
484         struct request_sock *fastopen;
485         u32 seq, snd_una;
486         int err;
487         struct net *net = dev_net(skb->dev);
488
489         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
490                                        iph->daddr, th->dest, iph->saddr,
491                                        ntohs(th->source), inet_iif(skb), 0);
492         if (!sk) {
493                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
494                 return -ENOENT;
495         }
496         if (sk->sk_state == TCP_TIME_WAIT) {
497                 /* To increase the counter of ignored icmps for TCP-AO */
498                 tcp_ao_ignore_icmp(sk, AF_INET, type, code);
499                 inet_twsk_put(inet_twsk(sk));
500                 return 0;
501         }
502         seq = ntohl(th->seq);
503         if (sk->sk_state == TCP_NEW_SYN_RECV) {
504                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
505                                      type == ICMP_TIME_EXCEEDED ||
506                                      (type == ICMP_DEST_UNREACH &&
507                                       (code == ICMP_NET_UNREACH ||
508                                        code == ICMP_HOST_UNREACH)));
509                 return 0;
510         }
511
512         if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
513                 sock_put(sk);
514                 return 0;
515         }
516
517         bh_lock_sock(sk);
 518         /* If too many ICMPs get dropped on busy
 519          * servers this needs to be solved differently.
 520          * We do take care of the PMTU discovery (RFC 1191) special case:
 521          * we can receive locally generated ICMP messages while the socket is held.
 522          */
523         if (sock_owned_by_user(sk)) {
524                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
525                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
526         }
527         if (sk->sk_state == TCP_CLOSE)
528                 goto out;
529
530         if (static_branch_unlikely(&ip4_min_ttl)) {
531                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
532                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
533                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
534                         goto out;
535                 }
536         }
537
538         tp = tcp_sk(sk);
539         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
540         fastopen = rcu_dereference(tp->fastopen_rsk);
541         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
542         if (sk->sk_state != TCP_LISTEN &&
543             !between(seq, snd_una, tp->snd_nxt)) {
544                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
545                 goto out;
546         }
547
548         switch (type) {
549         case ICMP_REDIRECT:
550                 if (!sock_owned_by_user(sk))
551                         do_redirect(skb, sk);
552                 goto out;
553         case ICMP_SOURCE_QUENCH:
554                 /* Just silently ignore these. */
555                 goto out;
556         case ICMP_PARAMETERPROB:
557                 err = EPROTO;
558                 break;
559         case ICMP_DEST_UNREACH:
560                 if (code > NR_ICMP_UNREACH)
561                         goto out;
562
563                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 564                         /* We are not interested in TCP_LISTEN and open_requests
 565                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
 566                          * they should go through unfragmented).
 567                          */
568                         if (sk->sk_state == TCP_LISTEN)
569                                 goto out;
570
571                         WRITE_ONCE(tp->mtu_info, info);
572                         if (!sock_owned_by_user(sk)) {
573                                 tcp_v4_mtu_reduced(sk);
574                         } else {
575                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
576                                         sock_hold(sk);
577                         }
578                         goto out;
579                 }
580
581                 err = icmp_err_convert[code].errno;
582                 /* check if this ICMP message allows revert of backoff.
583                  * (see RFC 6069)
584                  */
585                 if (!fastopen &&
586                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
587                         tcp_ld_RTO_revert(sk, seq);
588                 break;
589         case ICMP_TIME_EXCEEDED:
590                 err = EHOSTUNREACH;
591                 break;
592         default:
593                 goto out;
594         }
595
596         switch (sk->sk_state) {
597         case TCP_SYN_SENT:
598         case TCP_SYN_RECV:
599                 /* Only in fast or simultaneous open. If a fast open socket is
600                  * already accepted it is treated as a connected one below.
601                  */
602                 if (fastopen && !fastopen->sk)
603                         break;
604
605                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
606
607                 if (!sock_owned_by_user(sk)) {
608                         WRITE_ONCE(sk->sk_err, err);
609
610                         sk_error_report(sk);
611
612                         tcp_done(sk);
613                 } else {
614                         WRITE_ONCE(sk->sk_err_soft, err);
615                 }
616                 goto out;
617         }
618
 619         /* If we've already connected we will keep trying
 620          * until we time out, or the user gives up.
 621          *
 622          * rfc1122 4.2.3.9 allows considering only PROTO_UNREACH and
 623          * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
 624          * but it is obsoleted by pmtu discovery).
 625          *
 626          * Note that in the modern internet, where routing is unreliable
 627          * and broken firewalls sit in every dark corner sending random
 628          * errors ordered by their masters, even these two messages finally
 629          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
 630          *
 631          * Now we are in compliance with RFCs.
 632          *                                                      --ANK (980905)
 633          */
634
635         if (!sock_owned_by_user(sk) &&
636             inet_test_bit(RECVERR, sk)) {
637                 WRITE_ONCE(sk->sk_err, err);
638                 sk_error_report(sk);
639         } else  { /* Only an error on timeout */
640                 WRITE_ONCE(sk->sk_err_soft, err);
641         }
642
643 out:
644         bh_unlock_sock(sk);
645         sock_put(sk);
646         return 0;
647 }
648
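/* Set up a partial checksum: seed th->check with the pseudo-header checksum
 * and record csum_start/csum_offset so that hardware (or the software
 * fallback) can fold in the rest of the TCP checksum.
 */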
649 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
650 {
651         struct tcphdr *th = tcp_hdr(skb);
652
653         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
654         skb->csum_start = skb_transport_header(skb) - skb->head;
655         skb->csum_offset = offsetof(struct tcphdr, check);
656 }
657
658 /* This routine computes an IPv4 TCP checksum. */
659 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
660 {
661         const struct inet_sock *inet = inet_sk(sk);
662
663         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
664 }
665 EXPORT_SYMBOL(tcp_v4_send_check);
666
667 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
668
669 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
670                                  const struct tcp_ao_hdr *aoh,
671                                  struct ip_reply_arg *arg, struct tcphdr *reply,
672                                  __be32 reply_options[REPLY_OPTIONS_LEN])
673 {
674 #ifdef CONFIG_TCP_AO
675         int sdif = tcp_v4_sdif(skb);
676         int dif = inet_iif(skb);
677         int l3index = sdif ? dif : 0;
678         bool allocated_traffic_key;
679         struct tcp_ao_key *key;
680         char *traffic_key;
681         bool drop = true;
682         u32 ao_sne = 0;
683         u8 keyid;
684
685         rcu_read_lock();
686         if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
687                                  &key, &traffic_key, &allocated_traffic_key,
688                                  &keyid, &ao_sne))
689                 goto out;
690
691         reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
692                                  (aoh->rnext_keyid << 8) | keyid);
693         arg->iov[0].iov_len += tcp_ao_len_aligned(key);
694         reply->doff = arg->iov[0].iov_len / 4;
695
696         if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
697                             key, traffic_key,
698                             (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
699                             (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
700                             reply, ao_sne))
701                 goto out;
702         drop = false;
703 out:
704         rcu_read_unlock();
705         if (allocated_traffic_key)
706                 kfree(traffic_key);
707         return drop;
708 #else
709         return true;
710 #endif
711 }
712
 713 /*
 714  *      This routine will send an RST to the other TCP.
 715  *
 716  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 717  *                    for the reset?
 718  *      Answer: if a packet caused an RST, it is not for a socket
 719  *              existing in our system; if it is matched to a socket,
 720  *              it is just a duplicate segment or a bug in the other side's TCP.
 721  *              So we build the reply based only on the parameters
 722  *              that arrived with the segment.
 723  *      Exception: precedence violation. We do not implement it in any case.
 724  */
725
726 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
727 {
728         const struct tcphdr *th = tcp_hdr(skb);
729         struct {
730                 struct tcphdr th;
731                 __be32 opt[REPLY_OPTIONS_LEN];
732         } rep;
733         const __u8 *md5_hash_location = NULL;
734         const struct tcp_ao_hdr *aoh;
735         struct ip_reply_arg arg;
736 #ifdef CONFIG_TCP_MD5SIG
737         struct tcp_md5sig_key *key = NULL;
738         unsigned char newhash[16];
739         struct sock *sk1 = NULL;
740         int genhash;
741 #endif
742         u64 transmit_time = 0;
743         struct sock *ctl_sk;
744         struct net *net;
745         u32 txhash = 0;
746
747         /* Never send a reset in response to a reset. */
748         if (th->rst)
749                 return;
750
 751         /* If sk is not NULL, it means we did a successful lookup and the
 752          * incoming route had to be correct. prequeue might have dropped our dst.
 753          */
754         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
755                 return;
756
757         /* Swap the send and the receive. */
758         memset(&rep, 0, sizeof(rep));
759         rep.th.dest   = th->source;
760         rep.th.source = th->dest;
761         rep.th.doff   = sizeof(struct tcphdr) / 4;
762         rep.th.rst    = 1;
763
764         if (th->ack) {
765                 rep.th.seq = th->ack_seq;
766         } else {
767                 rep.th.ack = 1;
768                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
769                                        skb->len - (th->doff << 2));
770         }
771
772         memset(&arg, 0, sizeof(arg));
773         arg.iov[0].iov_base = (unsigned char *)&rep;
774         arg.iov[0].iov_len  = sizeof(rep.th);
775
776         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
777
778         /* Invalid TCP option size or twice included auth */
779         if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
780                 return;
781
782         if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
783                 return;
784
785 #ifdef CONFIG_TCP_MD5SIG
786         rcu_read_lock();
787         if (sk && sk_fullsock(sk)) {
788                 const union tcp_md5_addr *addr;
789                 int l3index;
790
791                 /* sdif set, means packet ingressed via a device
792                  * in an L3 domain and inet_iif is set to it.
793                  */
794                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
795                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
796                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
797         } else if (md5_hash_location) {
798                 const union tcp_md5_addr *addr;
799                 int sdif = tcp_v4_sdif(skb);
800                 int dif = inet_iif(skb);
801                 int l3index;
802
 803                 /*
 804                  * The active side is lost. Try to find the listening socket
 805                  * through the source port, and then find the md5 key through
 806                  * the listening socket. We do not lose security here:
 807                  * the incoming packet is checked with the md5 hash of the found
 808                  * key, and no RST is generated if the md5 hash doesn't match.
 809                  */
810                 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
811                                              NULL, 0, ip_hdr(skb)->saddr,
812                                              th->source, ip_hdr(skb)->daddr,
813                                              ntohs(th->source), dif, sdif);
 814                 /* don't send an RST if we can't find the key */
815                 if (!sk1)
816                         goto out;
817
818                 /* sdif set, means packet ingressed via a device
819                  * in an L3 domain and dif is set to it.
820                  */
821                 l3index = sdif ? dif : 0;
822                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
823                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
824                 if (!key)
825                         goto out;
826
827
828                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
829                 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
830                         goto out;
831
832         }
833
834         if (key) {
835                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
836                                    (TCPOPT_NOP << 16) |
837                                    (TCPOPT_MD5SIG << 8) |
838                                    TCPOLEN_MD5SIG);
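                /* The resulting 20-byte option block is
                 *   NOP, NOP, kind = 19 (MD5SIG), len = 18, 16-byte digest,
                 * which keeps the TCP header 32-bit aligned.
                 */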
 839                 /* Update the reply length and the length the TCP header advertises (doff) */
840                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
841                 rep.th.doff = arg.iov[0].iov_len / 4;
842
843                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
844                                      key, ip_hdr(skb)->saddr,
845                                      ip_hdr(skb)->daddr, &rep.th);
846         }
847 #endif
848         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
849         if (rep.opt[0] == 0) {
850                 __be32 mrst = mptcp_reset_option(skb);
851
852                 if (mrst) {
853                         rep.opt[0] = mrst;
854                         arg.iov[0].iov_len += sizeof(mrst);
855                         rep.th.doff = arg.iov[0].iov_len / 4;
856                 }
857         }
858
859         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
860                                       ip_hdr(skb)->saddr, /* XXX */
861                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
862         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
863         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
864
 865         /* When the socket is gone, all binding information is lost and
 866          * routing might fail in this case. No choice here: if we choose to force
 867          * the input interface, we will misroute in case of an asymmetric route.
 868          */
869         if (sk) {
870                 arg.bound_dev_if = sk->sk_bound_dev_if;
871                 if (sk_fullsock(sk))
872                         trace_tcp_send_reset(sk, skb);
873         }
874
875         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
876                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
877
878         arg.tos = ip_hdr(skb)->tos;
879         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
880         local_bh_disable();
881         ctl_sk = this_cpu_read(ipv4_tcp_sk);
882         sock_net_set(ctl_sk, net);
883         if (sk) {
884                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
885                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
886                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
887                                    inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
888                 transmit_time = tcp_transmit_time(sk);
889                 xfrm_sk_clone_policy(ctl_sk, sk);
890                 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
891                          inet_twsk(sk)->tw_txhash : sk->sk_txhash;
892         } else {
893                 ctl_sk->sk_mark = 0;
894                 ctl_sk->sk_priority = 0;
895         }
896         ip_send_unicast_reply(ctl_sk,
897                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
898                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
899                               &arg, arg.iov[0].iov_len,
900                               transmit_time, txhash);
901
902         xfrm_sk_free_policy(ctl_sk);
903         sock_net_set(ctl_sk, &init_net);
904         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
905         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
906         local_bh_enable();
907
908 #ifdef CONFIG_TCP_MD5SIG
909 out:
910         rcu_read_unlock();
911 #endif
912 }
913
 914 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 915    outside socket context, is certainly ugly. What can I do?
 916  */
917
918 static void tcp_v4_send_ack(const struct sock *sk,
919                             struct sk_buff *skb, u32 seq, u32 ack,
920                             u32 win, u32 tsval, u32 tsecr, int oif,
921                             struct tcp_key *key,
922                             int reply_flags, u8 tos, u32 txhash)
923 {
924         const struct tcphdr *th = tcp_hdr(skb);
925         struct {
926                 struct tcphdr th;
927                 __be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
928         } rep;
929         struct net *net = sock_net(sk);
930         struct ip_reply_arg arg;
931         struct sock *ctl_sk;
932         u64 transmit_time;
933
934         memset(&rep.th, 0, sizeof(struct tcphdr));
935         memset(&arg, 0, sizeof(arg));
936
937         arg.iov[0].iov_base = (unsigned char *)&rep;
938         arg.iov[0].iov_len  = sizeof(rep.th);
939         if (tsecr) {
940                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
941                                    (TCPOPT_TIMESTAMP << 8) |
942                                    TCPOLEN_TIMESTAMP);
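                /* Option layout: NOP, NOP, kind = 8 (TIMESTAMP), len = 10,
                 * followed by TSval and TSecr (12 bytes in total, aligned).
                 */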
943                 rep.opt[1] = htonl(tsval);
944                 rep.opt[2] = htonl(tsecr);
945                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
946         }
947
948         /* Swap the send and the receive. */
949         rep.th.dest    = th->source;
950         rep.th.source  = th->dest;
951         rep.th.doff    = arg.iov[0].iov_len / 4;
952         rep.th.seq     = htonl(seq);
953         rep.th.ack_seq = htonl(ack);
954         rep.th.ack     = 1;
955         rep.th.window  = htons(win);
956
957 #ifdef CONFIG_TCP_MD5SIG
958         if (tcp_key_is_md5(key)) {
959                 int offset = (tsecr) ? 3 : 0;
960
961                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
962                                           (TCPOPT_NOP << 16) |
963                                           (TCPOPT_MD5SIG << 8) |
964                                           TCPOLEN_MD5SIG);
965                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
966                 rep.th.doff = arg.iov[0].iov_len/4;
967
968                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
969                                     key->md5_key, ip_hdr(skb)->saddr,
970                                     ip_hdr(skb)->daddr, &rep.th);
971         }
972 #endif
973 #ifdef CONFIG_TCP_AO
974         if (tcp_key_is_ao(key)) {
975                 int offset = (tsecr) ? 3 : 0;
976
977                 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
978                                           (tcp_ao_len(key->ao_key) << 16) |
979                                           (key->ao_key->sndid << 8) |
980                                           key->rcv_next);
981                 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
982                 rep.th.doff = arg.iov[0].iov_len / 4;
983
984                 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
985                                 key->ao_key, key->traffic_key,
986                                 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
987                                 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
988                                 &rep.th, key->sne);
989         }
990 #endif
991         arg.flags = reply_flags;
992         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
993                                       ip_hdr(skb)->saddr, /* XXX */
994                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
995         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
996         if (oif)
997                 arg.bound_dev_if = oif;
998         arg.tos = tos;
999         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1000         local_bh_disable();
1001         ctl_sk = this_cpu_read(ipv4_tcp_sk);
1002         sock_net_set(ctl_sk, net);
1003         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1004                            inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1005         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1006                            inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1007         transmit_time = tcp_transmit_time(sk);
1008         ip_send_unicast_reply(ctl_sk,
1009                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
1010                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1011                               &arg, arg.iov[0].iov_len,
1012                               transmit_time, txhash);
1013
1014         sock_net_set(ctl_sk, &init_net);
1015         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1016         local_bh_enable();
1017 }
1018
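/* Answer a segment received for a TIME-WAIT socket with an ACK, signed with
 * TCP-AO or MD5 when a matching key is configured, then drop the tw reference.
 */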
1019 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1020 {
1021         struct inet_timewait_sock *tw = inet_twsk(sk);
1022         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1023         struct tcp_key key = {};
1024 #ifdef CONFIG_TCP_AO
1025         struct tcp_ao_info *ao_info;
1026
1027         if (static_branch_unlikely(&tcp_ao_needed.key)) {
1028                 /* FIXME: the segment to-be-acked is not verified yet */
1029                 ao_info = rcu_dereference(tcptw->ao_info);
1030                 if (ao_info) {
1031                         const struct tcp_ao_hdr *aoh;
1032
1033                         if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1034                                 inet_twsk_put(tw);
1035                                 return;
1036                         }
1037
1038                         if (aoh)
1039                                 key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1040                 }
1041         }
1042         if (key.ao_key) {
1043                 struct tcp_ao_key *rnext_key;
1044
1045                 key.traffic_key = snd_other_key(key.ao_key);
1046                 key.sne = READ_ONCE(ao_info->snd_sne);
1047                 rnext_key = READ_ONCE(ao_info->rnext_key);
1048                 key.rcv_next = rnext_key->rcvid;
1049                 key.type = TCP_KEY_AO;
1050 #else
1051         if (0) {
1052 #endif
1053 #ifdef CONFIG_TCP_MD5SIG
1054         } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1055                 key.md5_key = tcp_twsk_md5_key(tcptw);
1056                 if (key.md5_key)
1057                         key.type = TCP_KEY_MD5;
1058 #endif
1059         }
1060
1061         tcp_v4_send_ack(sk, skb,
1062                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1063                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1064                         tcp_tw_tsval(tcptw),
1065                         tcptw->tw_ts_recent,
1066                         tw->tw_bound_dev_if, &key,
1067                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1068                         tw->tw_tos,
1069                         tw->tw_txhash);
1070
1071         inet_twsk_put(tw);
1072 }
1073
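/* Send an ACK on behalf of a request socket: sk is the listener for a
 * regular SYN-RECV, or the not-yet-accepted child for Fast Open.
 */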
1074 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1075                                   struct request_sock *req)
1076 {
1077         struct tcp_key key = {};
1078
1079         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1080          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1081          */
1082         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1083                                              tcp_sk(sk)->snd_nxt;
1084
1085 #ifdef CONFIG_TCP_AO
1086         if (static_branch_unlikely(&tcp_ao_needed.key) &&
1087             tcp_rsk_used_ao(req)) {
1088                 const union tcp_md5_addr *addr;
1089                 const struct tcp_ao_hdr *aoh;
1090                 int l3index;
1091
1092                 /* Invalid TCP option size or twice included auth */
1093                 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1094                         return;
1095                 if (!aoh)
1096                         return;
1097
1098                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1099                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1100                 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1101                                               aoh->rnext_keyid, -1);
1102                 if (unlikely(!key.ao_key)) {
1103                         /* Send ACK with any matching MKT for the peer */
1104                         key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1105                         /* Matching key disappeared (user removed the key?)
 1106                          * let the handshake time out.
1107                          */
1108                         if (!key.ao_key) {
1109                                 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1110                                                      addr,
1111                                                      ntohs(tcp_hdr(skb)->source),
1112                                                      &ip_hdr(skb)->daddr,
1113                                                      ntohs(tcp_hdr(skb)->dest));
1114                                 return;
1115                         }
1116                 }
1117                 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1118                 if (!key.traffic_key)
1119                         return;
1120
1121                 key.type = TCP_KEY_AO;
1122                 key.rcv_next = aoh->keyid;
1123                 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1124 #else
1125         if (0) {
1126 #endif
1127 #ifdef CONFIG_TCP_MD5SIG
1128         } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1129                 const union tcp_md5_addr *addr;
1130                 int l3index;
1131
1132                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1133                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1134                 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1135                 if (key.md5_key)
1136                         key.type = TCP_KEY_MD5;
1137 #endif
1138         }
1139
1140         /* RFC 7323 2.3
1141          * The window field (SEG.WND) of every outgoing segment, with the
1142          * exception of <SYN> segments, MUST be right-shifted by
1143          * Rcv.Wind.Shift bits:
1144          */
1145         tcp_v4_send_ack(sk, skb, seq,
1146                         tcp_rsk(req)->rcv_nxt,
1147                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
1148                         tcp_rsk_tsval(tcp_rsk(req)),
1149                         READ_ONCE(req->ts_recent),
1150                         0, &key,
1151                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1152                         ip_hdr(skb)->tos,
1153                         READ_ONCE(tcp_rsk(req)->txhash));
1154         if (tcp_key_is_ao(&key))
1155                 kfree(key.traffic_key);
1156 }
1157
1158 /*
1159  *      Send a SYN-ACK after having received a SYN.
1160  *      This still operates on a request_sock only, not on a big
1161  *      socket.
1162  */
1163 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1164                               struct flowi *fl,
1165                               struct request_sock *req,
1166                               struct tcp_fastopen_cookie *foc,
1167                               enum tcp_synack_type synack_type,
1168                               struct sk_buff *syn_skb)
1169 {
1170         const struct inet_request_sock *ireq = inet_rsk(req);
1171         struct flowi4 fl4;
1172         int err = -1;
1173         struct sk_buff *skb;
1174         u8 tos;
1175
1176         /* First, grab a route. */
1177         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1178                 return -1;
1179
1180         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1181
1182         if (skb) {
1183                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1184
1185                 tos = READ_ONCE(inet_sk(sk)->tos);
1186
1187                 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1188                         tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1189                               (tos & INET_ECN_MASK);
1190
1191                 if (!INET_ECN_is_capable(tos) &&
1192                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1193                         tos |= INET_ECN_ECT_0;
1194
1195                 rcu_read_lock();
1196                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1197                                             ireq->ir_rmt_addr,
1198                                             rcu_dereference(ireq->ireq_opt),
1199                                             tos);
1200                 rcu_read_unlock();
1201                 err = net_xmit_eval(err);
1202         }
1203
1204         return err;
1205 }
1206
1207 /*
1208  *      IPv4 request_sock destructor.
1209  */
1210 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1211 {
1212         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1213 }
1214
1215 #ifdef CONFIG_TCP_MD5SIG
1216 /*
1217  * RFC2385 MD5 checksumming requires a mapping of
1218  * IP address->MD5 Key.
1219  * We need to maintain these in the sk structure.
1220  */
1221
1222 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1223 EXPORT_SYMBOL(tcp_md5_needed);
1224
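/* A key bound to an L3 domain is preferred over one that is not; among
 * otherwise equal matches, the longer prefix wins.
 */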
1225 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1226 {
1227         if (!old)
1228                 return true;
1229
1230         /* l3index always overrides non-l3index */
1231         if (old->l3index && new->l3index == 0)
1232                 return false;
1233         if (old->l3index == 0 && new->l3index)
1234                 return true;
1235
1236         return old->prefixlen < new->prefixlen;
1237 }
1238
1239 /* Find the Key structure for an address.  */
1240 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1241                                            const union tcp_md5_addr *addr,
1242                                            int family, bool any_l3index)
1243 {
1244         const struct tcp_sock *tp = tcp_sk(sk);
1245         struct tcp_md5sig_key *key;
1246         const struct tcp_md5sig_info *md5sig;
1247         __be32 mask;
1248         struct tcp_md5sig_key *best_match = NULL;
1249         bool match;
1250
1251         /* caller either holds rcu_read_lock() or socket lock */
1252         md5sig = rcu_dereference_check(tp->md5sig_info,
1253                                        lockdep_sock_is_held(sk));
1254         if (!md5sig)
1255                 return NULL;
1256
1257         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1258                                  lockdep_sock_is_held(sk)) {
1259                 if (key->family != family)
1260                         continue;
1261                 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1262                     key->l3index != l3index)
1263                         continue;
1264                 if (family == AF_INET) {
1265                         mask = inet_make_mask(key->prefixlen);
1266                         match = (key->addr.a4.s_addr & mask) ==
1267                                 (addr->a4.s_addr & mask);
1268 #if IS_ENABLED(CONFIG_IPV6)
1269                 } else if (family == AF_INET6) {
1270                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1271                                                   key->prefixlen);
1272 #endif
1273                 } else {
1274                         match = false;
1275                 }
1276
1277                 if (match && better_md5_match(best_match, key))
1278                         best_match = key;
1279         }
1280         return best_match;
1281 }
1282 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1283
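/* Exact lookup: address, family, prefix length, l3index and the ifindex flag
 * must all match, unlike the longest-prefix lookup above.
 */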
1284 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1285                                                       const union tcp_md5_addr *addr,
1286                                                       int family, u8 prefixlen,
1287                                                       int l3index, u8 flags)
1288 {
1289         const struct tcp_sock *tp = tcp_sk(sk);
1290         struct tcp_md5sig_key *key;
1291         unsigned int size = sizeof(struct in_addr);
1292         const struct tcp_md5sig_info *md5sig;
1293
1294         /* caller either holds rcu_read_lock() or socket lock */
1295         md5sig = rcu_dereference_check(tp->md5sig_info,
1296                                        lockdep_sock_is_held(sk));
1297         if (!md5sig)
1298                 return NULL;
1299 #if IS_ENABLED(CONFIG_IPV6)
1300         if (family == AF_INET6)
1301                 size = sizeof(struct in6_addr);
1302 #endif
1303         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1304                                  lockdep_sock_is_held(sk)) {
1305                 if (key->family != family)
1306                         continue;
1307                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1308                         continue;
1309                 if (key->l3index != l3index)
1310                         continue;
1311                 if (!memcmp(&key->addr, addr, size) &&
1312                     key->prefixlen == prefixlen)
1313                         return key;
1314         }
1315         return NULL;
1316 }
1317
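/* Find the MD5 key to use towards the peer of @addr_sk, scoped by the L3
 * master device (if any) of its bound interface.
 */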
1318 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1319                                          const struct sock *addr_sk)
1320 {
1321         const union tcp_md5_addr *addr;
1322         int l3index;
1323
1324         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1325                                                  addr_sk->sk_bound_dev_if);
1326         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1327         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1328 }
1329 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1330
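/* Allocate the per-socket list of MD5 keys. GSO is disabled here since
 * segments carrying the MD5 signature option have to be signed one by one.
 */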
1331 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1332 {
1333         struct tcp_sock *tp = tcp_sk(sk);
1334         struct tcp_md5sig_info *md5sig;
1335
1336         md5sig = kmalloc(sizeof(*md5sig), gfp);
1337         if (!md5sig)
1338                 return -ENOMEM;
1339
1340         sk_gso_disable(sk);
1341         INIT_HLIST_HEAD(&md5sig->head);
1342         rcu_assign_pointer(tp->md5sig_info, md5sig);
1343         return 0;
1344 }
1345
1346 /* This can be called on a newly created socket, from other files */
1347 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1348                             int family, u8 prefixlen, int l3index, u8 flags,
1349                             const u8 *newkey, u8 newkeylen, gfp_t gfp)
1350 {
1351         /* Add Key to the list */
1352         struct tcp_md5sig_key *key;
1353         struct tcp_sock *tp = tcp_sk(sk);
1354         struct tcp_md5sig_info *md5sig;
1355
1356         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1357         if (key) {
1358                 /* Pre-existing entry - just update that one.
1359                  * Note that the key might be used concurrently.
1360                  * data_race() tells KCSAN that we do not care about
1361                  * key mismatches, since changing the MD5 key on live flows
1362                  * can lead to packet drops.
1363                  */
1364                 data_race(memcpy(key->key, newkey, newkeylen));
1365
1366                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1367                  * Also note that a reader could observe the new key->keylen value
1368                  * but the old key->key[]; this is why we use __GFP_ZERO
1369                  * in the sock_kmalloc() call below.
1370                  */
1371                 WRITE_ONCE(key->keylen, newkeylen);
1372
1373                 return 0;
1374         }
1375
1376         md5sig = rcu_dereference_protected(tp->md5sig_info,
1377                                            lockdep_sock_is_held(sk));
1378
1379         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1380         if (!key)
1381                 return -ENOMEM;
1382
1383         memcpy(key->key, newkey, newkeylen);
1384         key->keylen = newkeylen;
1385         key->family = family;
1386         key->prefixlen = prefixlen;
1387         key->l3index = l3index;
1388         key->flags = flags;
1389         memcpy(&key->addr, addr,
1390                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1391                                                                  sizeof(struct in_addr));
1392         hlist_add_head_rcu(&key->node, &md5sig->head);
1393         return 0;
1394 }
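
/*
 * Illustrative userspace sketch, not part of this file: the publication
 * order used by __tcp_md5_do_add() when it updates a key in place. The
 * buffer is zero-filled at allocation time, the new key bytes are copied
 * first and the length is store-released last, so a concurrent reader never
 * walks past initialised memory; a torn key (new length, old bytes) is
 * tolerated, which is what the data_race()/WRITE_ONCE() pair above
 * expresses. All ex_ names are made up; strictly conforming userspace code
 * would need a lock or per-byte atomics instead of tolerating the race.
 */
#include <stdatomic.h>
#include <string.h>

#define EX_MAXKEYLEN 80

struct ex_md5_key {
	unsigned char key[EX_MAXKEYLEN];	/* zeroed when allocated */
	atomic_uint keylen;
};

static void ex_md5_key_update(struct ex_md5_key *k,
			      const void *newkey, unsigned int newkeylen)
{
	memcpy(k->key, newkey, newkeylen);	/* bytes first ... */
	atomic_store_explicit(&k->keylen, newkeylen,
			      memory_order_release);	/* ... length last */
}

static unsigned int ex_md5_key_snapshot(struct ex_md5_key *k,
					unsigned char *dst)
{
	unsigned int len = atomic_load_explicit(&k->keylen,
						memory_order_acquire);

	memcpy(dst, k->key, len);		/* may race with an update */
	return len;
}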
1395
1396 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1397                    int family, u8 prefixlen, int l3index, u8 flags,
1398                    const u8 *newkey, u8 newkeylen)
1399 {
1400         struct tcp_sock *tp = tcp_sk(sk);
1401
1402         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1403                 if (tcp_md5_alloc_sigpool())
1404                         return -ENOMEM;
1405
1406                 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1407                         tcp_md5_release_sigpool();
1408                         return -ENOMEM;
1409                 }
1410
1411                 if (!static_branch_inc(&tcp_md5_needed.key)) {
1412                         struct tcp_md5sig_info *md5sig;
1413
1414                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1415                         rcu_assign_pointer(tp->md5sig_info, NULL);
1416                         kfree_rcu(md5sig, rcu);
1417                         tcp_md5_release_sigpool();
1418                         return -EUSERS;
1419                 }
1420         }
1421
1422         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1423                                 newkey, newkeylen, GFP_KERNEL);
1424 }
1425 EXPORT_SYMBOL(tcp_md5_do_add);
1426
1427 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1428                      int family, u8 prefixlen, int l3index,
1429                      struct tcp_md5sig_key *key)
1430 {
1431         struct tcp_sock *tp = tcp_sk(sk);
1432
1433         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1434                 tcp_md5_add_sigpool();
1435
1436                 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1437                         tcp_md5_release_sigpool();
1438                         return -ENOMEM;
1439                 }
1440
1441                 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1442                         struct tcp_md5sig_info *md5sig;
1443
1444                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1445                         net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1446                         rcu_assign_pointer(tp->md5sig_info, NULL);
1447                         kfree_rcu(md5sig, rcu);
1448                         tcp_md5_release_sigpool();
1449                         return -EUSERS;
1450                 }
1451         }
1452
1453         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1454                                 key->flags, key->key, key->keylen,
1455                                 sk_gfp_mask(sk, GFP_ATOMIC));
1456 }
1457 EXPORT_SYMBOL(tcp_md5_key_copy);
1458
1459 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1460                    u8 prefixlen, int l3index, u8 flags)
1461 {
1462         struct tcp_md5sig_key *key;
1463
1464         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1465         if (!key)
1466                 return -ENOENT;
1467         hlist_del_rcu(&key->node);
1468         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1469         kfree_rcu(key, rcu);
1470         return 0;
1471 }
1472 EXPORT_SYMBOL(tcp_md5_do_del);
1473
1474 void tcp_clear_md5_list(struct sock *sk)
1475 {
1476         struct tcp_sock *tp = tcp_sk(sk);
1477         struct tcp_md5sig_key *key;
1478         struct hlist_node *n;
1479         struct tcp_md5sig_info *md5sig;
1480
1481         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1482
1483         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1484                 hlist_del_rcu(&key->node);
1485                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1486                 kfree_rcu(key, rcu);
1487         }
1488 }
1489
1490 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1491                                  sockptr_t optval, int optlen)
1492 {
1493         struct tcp_md5sig cmd;
1494         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1495         const union tcp_md5_addr *addr;
1496         u8 prefixlen = 32;
1497         int l3index = 0;
1498         bool l3flag;
1499         u8 flags;
1500
1501         if (optlen < sizeof(cmd))
1502                 return -EINVAL;
1503
1504         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1505                 return -EFAULT;
1506
1507         if (sin->sin_family != AF_INET)
1508                 return -EINVAL;
1509
1510         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1511         l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1512
1513         if (optname == TCP_MD5SIG_EXT &&
1514             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1515                 prefixlen = cmd.tcpm_prefixlen;
1516                 if (prefixlen > 32)
1517                         return -EINVAL;
1518         }
1519
1520         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1521             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1522                 struct net_device *dev;
1523
1524                 rcu_read_lock();
1525                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1526                 if (dev && netif_is_l3_master(dev))
1527                         l3index = dev->ifindex;
1528
1529                 rcu_read_unlock();
1530
1531                 /* ok to check outside of rcu whether dev was set or not;
1532                  * right now the device MUST be an L3 master
1533                  */
1534                 if (!dev || !l3index)
1535                         return -EINVAL;
1536         }
1537
1538         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1539
1540         if (!cmd.tcpm_keylen)
1541                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1542
1543         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1544                 return -EINVAL;
1545
1546         /* Don't allow keys for peers that have a matching TCP-AO key.
1547          * See the comment in tcp_ao_add_cmd()
1548          */
1549         if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1550                 return -EKEYREJECTED;
1551
1552         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1553                               cmd.tcpm_key, cmd.tcpm_keylen);
1554 }
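
/*
 * Illustrative userspace sketch, not part of this file: driving the parser
 * above through the TCP_MD5SIG_EXT socket option. struct tcp_md5sig, the
 * option number and the flag come from the UAPI header <linux/tcp.h>; the
 * ex_ helper name and its arguments are made up for the example. Passing
 * keylen == 0 would delete the key, mirroring the tcp_md5_do_del() branch.
 */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG_EXT */

static int ex_set_tcp_md5_key(int fd, const char *peer, unsigned char prefixlen,
			      const void *key, unsigned short keylen)
{
	struct tcp_md5sig md5;
	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

	if (keylen > TCP_MD5SIG_MAXKEYLEN || prefixlen > 32)
		return -1;	/* the kernel would return -EINVAL */

	memset(&md5, 0, sizeof(md5));
	sin->sin_family = AF_INET;
	if (inet_pton(AF_INET, peer, &sin->sin_addr) != 1)
		return -1;

	md5.tcpm_flags = TCP_MD5SIG_FLAG_PREFIX;	/* honour tcpm_prefixlen */
	md5.tcpm_prefixlen = prefixlen;
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);

	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG_EXT, &md5, sizeof(md5));
}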
1555
1556 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1557                                    __be32 daddr, __be32 saddr,
1558                                    const struct tcphdr *th, int nbytes)
1559 {
1560         struct tcp4_pseudohdr *bp;
1561         struct scatterlist sg;
1562         struct tcphdr *_th;
1563
1564         bp = hp->scratch;
1565         bp->saddr = saddr;
1566         bp->daddr = daddr;
1567         bp->pad = 0;
1568         bp->protocol = IPPROTO_TCP;
1569         bp->len = cpu_to_be16(nbytes);
1570
1571         _th = (struct tcphdr *)(bp + 1);
1572         memcpy(_th, th, sizeof(*th));
1573         _th->check = 0;
1574
1575         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1576         ahash_request_set_crypt(hp->req, &sg, NULL,
1577                                 sizeof(*bp) + sizeof(*th));
1578         return crypto_ahash_update(hp->req);
1579 }
1580
1581 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1582                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1583 {
1584         struct tcp_sigpool hp;
1585
1586         if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1587                 goto clear_hash_nostart;
1588
1589         if (crypto_ahash_init(hp.req))
1590                 goto clear_hash;
1591         if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1592                 goto clear_hash;
1593         if (tcp_md5_hash_key(&hp, key))
1594                 goto clear_hash;
1595         ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1596         if (crypto_ahash_final(hp.req))
1597                 goto clear_hash;
1598
1599         tcp_sigpool_end(&hp);
1600         return 0;
1601
1602 clear_hash:
1603         tcp_sigpool_end(&hp);
1604 clear_hash_nostart:
1605         memset(md5_hash, 0, 16);
1606         return 1;
1607 }
1608
1609 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1610                         const struct sock *sk,
1611                         const struct sk_buff *skb)
1612 {
1613         const struct tcphdr *th = tcp_hdr(skb);
1614         struct tcp_sigpool hp;
1615         __be32 saddr, daddr;
1616
1617         if (sk) { /* valid for establish/request sockets */
1618                 saddr = sk->sk_rcv_saddr;
1619                 daddr = sk->sk_daddr;
1620         } else {
1621                 const struct iphdr *iph = ip_hdr(skb);
1622                 saddr = iph->saddr;
1623                 daddr = iph->daddr;
1624         }
1625
1626         if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1627                 goto clear_hash_nostart;
1628
1629         if (crypto_ahash_init(hp.req))
1630                 goto clear_hash;
1631
1632         if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1633                 goto clear_hash;
1634         if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1635                 goto clear_hash;
1636         if (tcp_md5_hash_key(&hp, key))
1637                 goto clear_hash;
1638         ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1639         if (crypto_ahash_final(hp.req))
1640                 goto clear_hash;
1641
1642         tcp_sigpool_end(&hp);
1643         return 0;
1644
1645 clear_hash:
1646         tcp_sigpool_end(&hp);
1647 clear_hash_nostart:
1648         memset(md5_hash, 0, 16);
1649         return 1;
1650 }
1651 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
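
/*
 * Illustrative userspace sketch, not part of this file: the RFC 2385 digest
 * that tcp_v4_md5_hash_skb() computes, spelled out with OpenSSL's legacy
 * MD5_* helpers instead of the kernel's crypto_ahash sigpool. The digest
 * covers the IPv4 pseudo-header, the base TCP header with its checksum
 * zeroed (options are excluded), the payload and finally the key. The ex_
 * names and the packed pseudo-header struct are local to the example.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
#include <openssl/md5.h>

struct ex_tcp4_pseudohdr {
	uint32_t saddr;
	uint32_t daddr;
	uint8_t  pad;
	uint8_t  protocol;
	uint16_t len;			/* TCP header + payload, network order */
} __attribute__((packed));

static void ex_tcp_md5_digest(uint8_t digest[MD5_DIGEST_LENGTH],
			      uint32_t saddr, uint32_t daddr,
			      const struct tcphdr *th,
			      const void *payload, size_t payload_len,
			      const void *key, size_t keylen)
{
	struct ex_tcp4_pseudohdr bp = {
		.saddr = saddr, .daddr = daddr,
		.pad = 0, .protocol = IPPROTO_TCP,
		.len = htons((uint16_t)(th->doff * 4 + payload_len)),
	};
	struct tcphdr hdr = *th;
	MD5_CTX ctx;

	hdr.check = 0;				/* checksum is hashed as zero */

	MD5_Init(&ctx);
	MD5_Update(&ctx, &bp, sizeof(bp));
	MD5_Update(&ctx, &hdr, sizeof(hdr));	/* 20-byte base header only */
	MD5_Update(&ctx, payload, payload_len);
	MD5_Update(&ctx, key, keylen);
	MD5_Final(digest, &ctx);
}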
1652
1653 #endif
1654
1655 static void tcp_v4_init_req(struct request_sock *req,
1656                             const struct sock *sk_listener,
1657                             struct sk_buff *skb)
1658 {
1659         struct inet_request_sock *ireq = inet_rsk(req);
1660         struct net *net = sock_net(sk_listener);
1661
1662         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1663         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1664         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1665 }
1666
1667 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1668                                           struct sk_buff *skb,
1669                                           struct flowi *fl,
1670                                           struct request_sock *req)
1671 {
1672         tcp_v4_init_req(req, sk, skb);
1673
1674         if (security_inet_conn_request(sk, skb, req))
1675                 return NULL;
1676
1677         return inet_csk_route_req(sk, &fl->u.ip4, req);
1678 }
1679
1680 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1681         .family         =       PF_INET,
1682         .obj_size       =       sizeof(struct tcp_request_sock),
1683         .rtx_syn_ack    =       tcp_rtx_synack,
1684         .send_ack       =       tcp_v4_reqsk_send_ack,
1685         .destructor     =       tcp_v4_reqsk_destructor,
1686         .send_reset     =       tcp_v4_send_reset,
1687         .syn_ack_timeout =      tcp_syn_ack_timeout,
1688 };
1689
1690 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1691         .mss_clamp      =       TCP_MSS_DEFAULT,
1692 #ifdef CONFIG_TCP_MD5SIG
1693         .req_md5_lookup =       tcp_v4_md5_lookup,
1694         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1695 #endif
1696 #ifdef CONFIG_TCP_AO
1697         .ao_lookup      =       tcp_v4_ao_lookup_rsk,
1698         .ao_calc_key    =       tcp_v4_ao_calc_key_rsk,
1699         .ao_synack_hash =       tcp_v4_ao_synack_hash,
1700 #endif
1701 #ifdef CONFIG_SYN_COOKIES
1702         .cookie_init_seq =      cookie_v4_init_sequence,
1703 #endif
1704         .route_req      =       tcp_v4_route_req,
1705         .init_seq       =       tcp_v4_init_seq,
1706         .init_ts_off    =       tcp_v4_init_ts_off,
1707         .send_synack    =       tcp_v4_send_synack,
1708 };
1709
1710 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1711 {
1712         /* Never answer SYNs sent to broadcast or multicast addresses */
1713         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1714                 goto drop;
1715
1716         return tcp_conn_request(&tcp_request_sock_ops,
1717                                 &tcp_request_sock_ipv4_ops, sk, skb);
1718
1719 drop:
1720         tcp_listendrop(sk);
1721         return 0;
1722 }
1723 EXPORT_SYMBOL(tcp_v4_conn_request);
1724
1725
1726 /*
1727  * The three-way handshake has completed - we received a valid final ACK -
1728  * now create the new socket.
1729  */
1730 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1731                                   struct request_sock *req,
1732                                   struct dst_entry *dst,
1733                                   struct request_sock *req_unhash,
1734                                   bool *own_req)
1735 {
1736         struct inet_request_sock *ireq;
1737         bool found_dup_sk = false;
1738         struct inet_sock *newinet;
1739         struct tcp_sock *newtp;
1740         struct sock *newsk;
1741 #ifdef CONFIG_TCP_MD5SIG
1742         const union tcp_md5_addr *addr;
1743         struct tcp_md5sig_key *key;
1744         int l3index;
1745 #endif
1746         struct ip_options_rcu *inet_opt;
1747
1748         if (sk_acceptq_is_full(sk))
1749                 goto exit_overflow;
1750
1751         newsk = tcp_create_openreq_child(sk, req, skb);
1752         if (!newsk)
1753                 goto exit_nonewsk;
1754
1755         newsk->sk_gso_type = SKB_GSO_TCPV4;
1756         inet_sk_rx_dst_set(newsk, skb);
1757
1758         newtp                 = tcp_sk(newsk);
1759         newinet               = inet_sk(newsk);
1760         ireq                  = inet_rsk(req);
1761         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1762         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1763         newsk->sk_bound_dev_if = ireq->ir_iif;
1764         newinet->inet_saddr   = ireq->ir_loc_addr;
1765         inet_opt              = rcu_dereference(ireq->ireq_opt);
1766         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1767         newinet->mc_index     = inet_iif(skb);
1768         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1769         newinet->rcv_tos      = ip_hdr(skb)->tos;
1770         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1771         if (inet_opt)
1772                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1773         atomic_set(&newinet->inet_id, get_random_u16());
1774
1775         /* Set ToS of the new socket based upon the value of incoming SYN.
1776          * ECT bits are set later in tcp_init_transfer().
1777          */
1778         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1779                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1780
1781         if (!dst) {
1782                 dst = inet_csk_route_child_sock(sk, newsk, req);
1783                 if (!dst)
1784                         goto put_and_exit;
1785         } else {
1786                 /* syncookie case : see end of cookie_v4_check() */
1787         }
1788         sk_setup_caps(newsk, dst);
1789
1790         tcp_ca_openreq_child(newsk, dst);
1791
1792         tcp_sync_mss(newsk, dst_mtu(dst));
1793         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1794
1795         tcp_initialize_rcv_mss(newsk);
1796
1797 #ifdef CONFIG_TCP_MD5SIG
1798         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1799         /* Copy over the MD5 key from the original socket */
1800         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1801         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1802         if (key && !tcp_rsk_used_ao(req)) {
1803                 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1804                         goto put_and_exit;
1805                 sk_gso_disable(newsk);
1806         }
1807 #endif
1808 #ifdef CONFIG_TCP_AO
1809         if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1810                 goto put_and_exit; /* OOM, release back memory */
1811 #endif
1812
1813         if (__inet_inherit_port(sk, newsk) < 0)
1814                 goto put_and_exit;
1815         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1816                                        &found_dup_sk);
1817         if (likely(*own_req)) {
1818                 tcp_move_syn(newtp, req);
1819                 ireq->ireq_opt = NULL;
1820         } else {
1821                 newinet->inet_opt = NULL;
1822
1823                 if (!req_unhash && found_dup_sk) {
1824                         /* This code path should be executed only in the
1825                          * syncookie case
1826                          */
1827                         bh_unlock_sock(newsk);
1828                         sock_put(newsk);
1829                         newsk = NULL;
1830                 }
1831         }
1832         return newsk;
1833
1834 exit_overflow:
1835         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1836 exit_nonewsk:
1837         dst_release(dst);
1838 exit:
1839         tcp_listendrop(sk);
1840         return NULL;
1841 put_and_exit:
1842         newinet->inet_opt = NULL;
1843         inet_csk_prepare_forced_close(newsk);
1844         tcp_done(newsk);
1845         goto exit;
1846 }
1847 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1848
1849 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1850 {
1851 #ifdef CONFIG_SYN_COOKIES
1852         const struct tcphdr *th = tcp_hdr(skb);
1853
1854         if (!th->syn)
1855                 sk = cookie_v4_check(sk, skb);
1856 #endif
1857         return sk;
1858 }
1859
1860 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1861                          struct tcphdr *th, u32 *cookie)
1862 {
1863         u16 mss = 0;
1864 #ifdef CONFIG_SYN_COOKIES
1865         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1866                                     &tcp_request_sock_ipv4_ops, sk, th);
1867         if (mss) {
1868                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1869                 tcp_synq_overflow(sk);
1870         }
1871 #endif
1872         return mss;
1873 }
1874
1875 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1876                                                            u32));
1877 /* The socket must have its spinlock held when we get
1878  * here, unless it is a TCP_LISTEN socket.
1879  *
1880  * We have a potential double-lock case here, so even when
1881  * doing backlog processing we use the BH locking scheme.
1882  * This is because we cannot sleep with the original spinlock
1883  * held.
1884  */
1885 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1886 {
1887         enum skb_drop_reason reason;
1888         struct sock *rsk;
1889
1890         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1891                 struct dst_entry *dst;
1892
1893                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1894                                                 lockdep_sock_is_held(sk));
1895
1896                 sock_rps_save_rxhash(sk, skb);
1897                 sk_mark_napi_id(sk, skb);
1898                 if (dst) {
1899                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1900                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1901                                              dst, 0)) {
1902                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1903                                 dst_release(dst);
1904                         }
1905                 }
1906                 tcp_rcv_established(sk, skb);
1907                 return 0;
1908         }
1909
1910         if (tcp_checksum_complete(skb))
1911                 goto csum_err;
1912
1913         if (sk->sk_state == TCP_LISTEN) {
1914                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1915
1916                 if (!nsk)
1917                         return 0;
1918                 if (nsk != sk) {
1919                         reason = tcp_child_process(sk, nsk, skb);
1920                         if (reason) {
1921                                 rsk = nsk;
1922                                 goto reset;
1923                         }
1924                         return 0;
1925                 }
1926         } else
1927                 sock_rps_save_rxhash(sk, skb);
1928
1929         reason = tcp_rcv_state_process(sk, skb);
1930         if (reason) {
1931                 rsk = sk;
1932                 goto reset;
1933         }
1934         return 0;
1935
1936 reset:
1937         tcp_v4_send_reset(rsk, skb);
1938 discard:
1939         kfree_skb_reason(skb, reason);
1940         /* Be careful here. If this function gets more complicated and
1941          * gcc suffers from register pressure on the x86, sk (in %ebx)
1942          * might be destroyed here. This current version compiles correctly,
1943          * but you have been warned.
1944          */
1945         return 0;
1946
1947 csum_err:
1948         reason = SKB_DROP_REASON_TCP_CSUM;
1949         trace_tcp_bad_csum(skb);
1950         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1951         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1952         goto discard;
1953 }
1954 EXPORT_SYMBOL(tcp_v4_do_rcv);
1955
1956 int tcp_v4_early_demux(struct sk_buff *skb)
1957 {
1958         struct net *net = dev_net(skb->dev);
1959         const struct iphdr *iph;
1960         const struct tcphdr *th;
1961         struct sock *sk;
1962
1963         if (skb->pkt_type != PACKET_HOST)
1964                 return 0;
1965
1966         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1967                 return 0;
1968
1969         iph = ip_hdr(skb);
1970         th = tcp_hdr(skb);
1971
1972         if (th->doff < sizeof(struct tcphdr) / 4)
1973                 return 0;
1974
1975         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1976                                        iph->saddr, th->source,
1977                                        iph->daddr, ntohs(th->dest),
1978                                        skb->skb_iif, inet_sdif(skb));
1979         if (sk) {
1980                 skb->sk = sk;
1981                 skb->destructor = sock_edemux;
1982                 if (sk_fullsock(sk)) {
1983                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1984
1985                         if (dst)
1986                                 dst = dst_check(dst, 0);
1987                         if (dst &&
1988                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1989                                 skb_dst_set_noref(skb, dst);
1990                 }
1991         }
1992         return 0;
1993 }
1994
1995 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1996                      enum skb_drop_reason *reason)
1997 {
1998         u32 limit, tail_gso_size, tail_gso_segs;
1999         struct skb_shared_info *shinfo;
2000         const struct tcphdr *th;
2001         struct tcphdr *thtail;
2002         struct sk_buff *tail;
2003         unsigned int hdrlen;
2004         bool fragstolen;
2005         u32 gso_segs;
2006         u32 gso_size;
2007         int delta;
2008
2009         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2010          * we can fix skb->truesize to its real value to avoid future drops.
2011          * This is valid because skb is not yet charged to the socket.
2012          * It has been noticed that pure SACK packets were sometimes dropped
2013          * (when cooked by drivers without the copybreak feature).
2014          */
2015         skb_condense(skb);
2016
2017         skb_dst_drop(skb);
2018
2019         if (unlikely(tcp_checksum_complete(skb))) {
2020                 bh_unlock_sock(sk);
2021                 trace_tcp_bad_csum(skb);
2022                 *reason = SKB_DROP_REASON_TCP_CSUM;
2023                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2024                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2025                 return true;
2026         }
2027
2028         /* Attempt coalescing to last skb in backlog, even if we are
2029          * above the limits.
2030          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2031          */
2032         th = (const struct tcphdr *)skb->data;
2033         hdrlen = th->doff * 4;
2034
2035         tail = sk->sk_backlog.tail;
2036         if (!tail)
2037                 goto no_coalesce;
2038         thtail = (struct tcphdr *)tail->data;
2039
2040         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2041             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2042             ((TCP_SKB_CB(tail)->tcp_flags |
2043               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2044             !((TCP_SKB_CB(tail)->tcp_flags &
2045               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2046             ((TCP_SKB_CB(tail)->tcp_flags ^
2047               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2048 #ifdef CONFIG_TLS_DEVICE
2049             tail->decrypted != skb->decrypted ||
2050 #endif
2051             !mptcp_skb_can_collapse(tail, skb) ||
2052             thtail->doff != th->doff ||
2053             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2054                 goto no_coalesce;
2055
2056         __skb_pull(skb, hdrlen);
2057
2058         shinfo = skb_shinfo(skb);
2059         gso_size = shinfo->gso_size ?: skb->len;
2060         gso_segs = shinfo->gso_segs ?: 1;
2061
2062         shinfo = skb_shinfo(tail);
2063         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2064         tail_gso_segs = shinfo->gso_segs ?: 1;
2065
2066         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2067                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2068
2069                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2070                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2071                         thtail->window = th->window;
2072                 }
2073
2074                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2075                  * thtail->fin, so that the fast path in tcp_rcv_established()
2076                  * is not entered if we append a packet with a FIN.
2077                  * SYN, RST, URG are not present.
2078                  * ACK is set on both packets.
2079                  * PSH : the TCP stack does not really care about it,
2080                  *       at least for 'GRO' packets.
2081                  */
2082                 thtail->fin |= th->fin;
2083                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2084
2085                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2086                         TCP_SKB_CB(tail)->has_rxtstamp = true;
2087                         tail->tstamp = skb->tstamp;
2088                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2089                 }
2090
2091                 /* Not as strict as GRO. We only need to carry the max mss value */
2092                 shinfo->gso_size = max(gso_size, tail_gso_size);
2093                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2094
2095                 sk->sk_backlog.len += delta;
2096                 __NET_INC_STATS(sock_net(sk),
2097                                 LINUX_MIB_TCPBACKLOGCOALESCE);
2098                 kfree_skb_partial(skb, fragstolen);
2099                 return false;
2100         }
2101         __skb_push(skb, hdrlen);
2102
2103 no_coalesce:
2104         limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
2105
2106         /* Only the socket owner can try to collapse/prune rx queues
2107          * to reduce memory overhead, so add a little headroom here.
2108          * Only a few socket backlogs are likely to be non-empty at a time.
2109          */
2110         limit += 64 * 1024;
2111
2112         if (unlikely(sk_add_backlog(sk, skb, limit))) {
2113                 bh_unlock_sock(sk);
2114                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2115                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2116                 return true;
2117         }
2118         return false;
2119 }
2120 EXPORT_SYMBOL(tcp_add_backlog);
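
/*
 * Illustrative helper, not part of this file: the backlog budget that
 * tcp_add_backlog() enforces above, written out as plain arithmetic. The
 * real code reads sk_rcvbuf and sk_sndbuf with READ_ONCE(); the extra
 * 64 KB of headroom exists because only the socket owner can shrink the
 * receive queues. The ex_ name is made up for the example.
 */
#include <stdint.h>

static inline uint32_t ex_tcp_backlog_limit(uint32_t rcvbuf, uint32_t sndbuf)
{
	return rcvbuf + (sndbuf >> 1) + 64 * 1024;
}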
2121
2122 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2123 {
2124         struct tcphdr *th = (struct tcphdr *)skb->data;
2125
2126         return sk_filter_trim_cap(sk, skb, th->doff * 4);
2127 }
2128 EXPORT_SYMBOL(tcp_filter);
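
/*
 * Illustrative userspace sketch, not part of this file: attaching a classic
 * BPF socket filter, which is what tcp_filter() above ends up running via
 * sk_filter_trim_cap(). A filter return value smaller than the packet length
 * trims it (never below the TCP header, the cap passed above) and 0 drops
 * it. The one-instruction program here accepts everything up to 64 KB.
 * BPF_STMT and the structs come from <linux/filter.h>; the ex_ name is made
 * up for the example.
 */
#include <stdint.h>
#include <sys/socket.h>
#include <linux/filter.h>

#ifndef SO_ATTACH_FILTER
#define SO_ATTACH_FILTER 26	/* value on asm-generic architectures */
#endif

static int ex_attach_accept_all_filter(int fd)
{
	struct sock_filter insns[] = {
		BPF_STMT(BPF_RET | BPF_K, 0xffff),	/* accept up to 64 KB */
	};
	struct sock_fprog prog = {
		.len = sizeof(insns) / sizeof(insns[0]),
		.filter = insns,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}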
2129
2130 static void tcp_v4_restore_cb(struct sk_buff *skb)
2131 {
2132         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2133                 sizeof(struct inet_skb_parm));
2134 }
2135
2136 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2137                            const struct tcphdr *th)
2138 {
2139         /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB();
2140          * barrier() makes sure the compiler won't play fool^Waliasing games.
2141          */
2142         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2143                 sizeof(struct inet_skb_parm));
2144         barrier();
2145
2146         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2147         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2148                                     skb->len - th->doff * 4);
2149         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2150         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2151         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
2152         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2153         TCP_SKB_CB(skb)->sacked  = 0;
2154         TCP_SKB_CB(skb)->has_rxtstamp =
2155                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2156 }
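
/*
 * Illustrative helper, not part of this file: the end_seq arithmetic from
 * tcp_v4_fill_cb() above. SYN and FIN each occupy one unit of sequence
 * space on top of the payload bytes, and the payload length is the total
 * segment length minus the header (doff counts 32-bit words). The ex_ name
 * is made up for the example.
 */
#include <stdint.h>

static inline uint32_t ex_tcp_end_seq(uint32_t seq, unsigned int syn,
				      unsigned int fin, uint32_t skb_len,
				      unsigned int doff)
{
	return seq + syn + fin + (skb_len - doff * 4);
}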
2157
2158 /*
2159  *      From tcp_input.c
2160  */
2161
2162 int tcp_v4_rcv(struct sk_buff *skb)
2163 {
2164         struct net *net = dev_net(skb->dev);
2165         enum skb_drop_reason drop_reason;
2166         int sdif = inet_sdif(skb);
2167         int dif = inet_iif(skb);
2168         const struct iphdr *iph;
2169         const struct tcphdr *th;
2170         bool refcounted;
2171         struct sock *sk;
2172         int ret;
2173
2174         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2175         if (skb->pkt_type != PACKET_HOST)
2176                 goto discard_it;
2177
2178         /* Count it even if it's bad */
2179         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2180
2181         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2182                 goto discard_it;
2183
2184         th = (const struct tcphdr *)skb->data;
2185
2186         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2187                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2188                 goto bad_packet;
2189         }
2190         if (!pskb_may_pull(skb, th->doff * 4))
2191                 goto discard_it;
2192
2193         /* An explanation is required here, I think.
2194          * Packet length and doff are validated by header prediction,
2195          * provided the case of th->doff == 0 is eliminated.
2196          * So, we defer the checks. */
2197
2198         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2199                 goto csum_error;
2200
2201         th = (const struct tcphdr *)skb->data;
2202         iph = ip_hdr(skb);
2203 lookup:
2204         sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2205                                skb, __tcp_hdrlen(th), th->source,
2206                                th->dest, sdif, &refcounted);
2207         if (!sk)
2208                 goto no_tcp_socket;
2209
2210 process:
2211         if (sk->sk_state == TCP_TIME_WAIT)
2212                 goto do_time_wait;
2213
2214         if (sk->sk_state == TCP_NEW_SYN_RECV) {
2215                 struct request_sock *req = inet_reqsk(sk);
2216                 bool req_stolen = false;
2217                 struct sock *nsk;
2218
2219                 sk = req->rsk_listener;
2220                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2221                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2222                 else
2223                         drop_reason = tcp_inbound_hash(sk, req, skb,
2224                                                        &iph->saddr, &iph->daddr,
2225                                                        AF_INET, dif, sdif);
2226                 if (unlikely(drop_reason)) {
2227                         sk_drops_add(sk, skb);
2228                         reqsk_put(req);
2229                         goto discard_it;
2230                 }
2231                 if (tcp_checksum_complete(skb)) {
2232                         reqsk_put(req);
2233                         goto csum_error;
2234                 }
2235                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2236                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2237                         if (!nsk) {
2238                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2239                                 goto lookup;
2240                         }
2241                         sk = nsk;
2242                         /* reuseport_migrate_sock() has already held one sk_refcnt
2243                          * before returning.
2244                          */
2245                 } else {
2246                         /* We own a reference on the listener, increase it again
2247                          * as we might lose it too soon.
2248                          */
2249                         sock_hold(sk);
2250                 }
2251                 refcounted = true;
2252                 nsk = NULL;
2253                 if (!tcp_filter(sk, skb)) {
2254                         th = (const struct tcphdr *)skb->data;
2255                         iph = ip_hdr(skb);
2256                         tcp_v4_fill_cb(skb, iph, th);
2257                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2258                 } else {
2259                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2260                 }
2261                 if (!nsk) {
2262                         reqsk_put(req);
2263                         if (req_stolen) {
2264                                 /* Another cpu got exclusive access to req
2265                                  * and created a full blown socket.
2266                                  * Try to feed this packet to this socket
2267                                  * instead of discarding it.
2268                                  */
2269                                 tcp_v4_restore_cb(skb);
2270                                 sock_put(sk);
2271                                 goto lookup;
2272                         }
2273                         goto discard_and_relse;
2274                 }
2275                 nf_reset_ct(skb);
2276                 if (nsk == sk) {
2277                         reqsk_put(req);
2278                         tcp_v4_restore_cb(skb);
2279                 } else {
2280                         drop_reason = tcp_child_process(sk, nsk, skb);
2281                         if (drop_reason) {
2282                                 tcp_v4_send_reset(nsk, skb);
2283                                 goto discard_and_relse;
2284                         }
2285                         sock_put(sk);
2286                         return 0;
2287                 }
2288         }
2289
2290         if (static_branch_unlikely(&ip4_min_ttl)) {
2291                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2292                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2293                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2294                         drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2295                         goto discard_and_relse;
2296                 }
2297         }
2298
2299         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2300                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2301                 goto discard_and_relse;
2302         }
2303
2304         drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2305                                        AF_INET, dif, sdif);
2306         if (drop_reason)
2307                 goto discard_and_relse;
2308
2309         nf_reset_ct(skb);
2310
2311         if (tcp_filter(sk, skb)) {
2312                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2313                 goto discard_and_relse;
2314         }
2315         th = (const struct tcphdr *)skb->data;
2316         iph = ip_hdr(skb);
2317         tcp_v4_fill_cb(skb, iph, th);
2318
2319         skb->dev = NULL;
2320
2321         if (sk->sk_state == TCP_LISTEN) {
2322                 ret = tcp_v4_do_rcv(sk, skb);
2323                 goto put_and_return;
2324         }
2325
2326         sk_incoming_cpu_update(sk);
2327
2328         bh_lock_sock_nested(sk);
2329         tcp_segs_in(tcp_sk(sk), skb);
2330         ret = 0;
2331         if (!sock_owned_by_user(sk)) {
2332                 ret = tcp_v4_do_rcv(sk, skb);
2333         } else {
2334                 if (tcp_add_backlog(sk, skb, &drop_reason))
2335                         goto discard_and_relse;
2336         }
2337         bh_unlock_sock(sk);
2338
2339 put_and_return:
2340         if (refcounted)
2341                 sock_put(sk);
2342
2343         return ret;
2344
2345 no_tcp_socket:
2346         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2347         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2348                 goto discard_it;
2349
2350         tcp_v4_fill_cb(skb, iph, th);
2351
2352         if (tcp_checksum_complete(skb)) {
2353 csum_error:
2354                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2355                 trace_tcp_bad_csum(skb);
2356                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2357 bad_packet:
2358                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2359         } else {
2360                 tcp_v4_send_reset(NULL, skb);
2361         }
2362
2363 discard_it:
2364         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2365         /* Discard frame. */
2366         kfree_skb_reason(skb, drop_reason);
2367         return 0;
2368
2369 discard_and_relse:
2370         sk_drops_add(sk, skb);
2371         if (refcounted)
2372                 sock_put(sk);
2373         goto discard_it;
2374
2375 do_time_wait:
2376         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2377                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2378                 inet_twsk_put(inet_twsk(sk));
2379                 goto discard_it;
2380         }
2381
2382         tcp_v4_fill_cb(skb, iph, th);
2383
2384         if (tcp_checksum_complete(skb)) {
2385                 inet_twsk_put(inet_twsk(sk));
2386                 goto csum_error;
2387         }
2388         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2389         case TCP_TW_SYN: {
2390                 struct sock *sk2 = inet_lookup_listener(net,
2391                                                         net->ipv4.tcp_death_row.hashinfo,
2392                                                         skb, __tcp_hdrlen(th),
2393                                                         iph->saddr, th->source,
2394                                                         iph->daddr, th->dest,
2395                                                         inet_iif(skb),
2396                                                         sdif);
2397                 if (sk2) {
2398                         inet_twsk_deschedule_put(inet_twsk(sk));
2399                         sk = sk2;
2400                         tcp_v4_restore_cb(skb);
2401                         refcounted = false;
2402                         goto process;
2403                 }
2404         }
2405                 /* to ACK */
2406                 fallthrough;
2407         case TCP_TW_ACK:
2408                 tcp_v4_timewait_ack(sk, skb);
2409                 break;
2410         case TCP_TW_RST:
2411                 tcp_v4_send_reset(sk, skb);
2412                 inet_twsk_deschedule_put(inet_twsk(sk));
2413                 goto discard_it;
2414         case TCP_TW_SUCCESS:;
2415         }
2416         goto discard_it;
2417 }
2418
2419 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2420         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2421         .twsk_unique    = tcp_twsk_unique,
2422         .twsk_destructor= tcp_twsk_destructor,
2423 };
2424
2425 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2426 {
2427         struct dst_entry *dst = skb_dst(skb);
2428
2429         if (dst && dst_hold_safe(dst)) {
2430                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2431                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2432         }
2433 }
2434 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2435
2436 const struct inet_connection_sock_af_ops ipv4_specific = {
2437         .queue_xmit        = ip_queue_xmit,
2438         .send_check        = tcp_v4_send_check,
2439         .rebuild_header    = inet_sk_rebuild_header,
2440         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2441         .conn_request      = tcp_v4_conn_request,
2442         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2443         .net_header_len    = sizeof(struct iphdr),
2444         .setsockopt        = ip_setsockopt,
2445         .getsockopt        = ip_getsockopt,
2446         .addr2sockaddr     = inet_csk_addr2sockaddr,
2447         .sockaddr_len      = sizeof(struct sockaddr_in),
2448         .mtu_reduced       = tcp_v4_mtu_reduced,
2449 };
2450 EXPORT_SYMBOL(ipv4_specific);
2451
2452 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2453 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2454 #ifdef CONFIG_TCP_MD5SIG
2455         .md5_lookup             = tcp_v4_md5_lookup,
2456         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2457         .md5_parse              = tcp_v4_parse_md5_keys,
2458 #endif
2459 #ifdef CONFIG_TCP_AO
2460         .ao_lookup              = tcp_v4_ao_lookup,
2461         .calc_ao_hash           = tcp_v4_ao_hash_skb,
2462         .ao_parse               = tcp_v4_parse_ao,
2463         .ao_calc_key_sk         = tcp_v4_ao_calc_key_sk,
2464 #endif
2465 };
2466 #endif
2467
2468 /* NOTE: A lot of fields are set to zero explicitly by the call to
2469  *       sk_alloc(), so they need not be initialized here.
2470  */
2471 static int tcp_v4_init_sock(struct sock *sk)
2472 {
2473         struct inet_connection_sock *icsk = inet_csk(sk);
2474
2475         tcp_init_sock(sk);
2476
2477         icsk->icsk_af_ops = &ipv4_specific;
2478
2479 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2480         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2481 #endif
2482
2483         return 0;
2484 }
2485
2486 #ifdef CONFIG_TCP_MD5SIG
2487 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2488 {
2489         struct tcp_md5sig_info *md5sig;
2490
2491         md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2492         kfree(md5sig);
2493         static_branch_slow_dec_deferred(&tcp_md5_needed);
2494         tcp_md5_release_sigpool();
2495 }
2496 #endif
2497
2498 void tcp_v4_destroy_sock(struct sock *sk)
2499 {
2500         struct tcp_sock *tp = tcp_sk(sk);
2501
2502         trace_tcp_destroy_sock(sk);
2503
2504         tcp_clear_xmit_timers(sk);
2505
2506         tcp_cleanup_congestion_control(sk);
2507
2508         tcp_cleanup_ulp(sk);
2509
2510         /* Clean up the write buffer. */
2511         tcp_write_queue_purge(sk);
2512
2513         /* Check if we want to disable active TFO */
2514         tcp_fastopen_active_disable_ofo_check(sk);
2515
2516         /* Cleans up our, hopefully empty, out_of_order_queue. */
2517         skb_rbtree_purge(&tp->out_of_order_queue);
2518
2519 #ifdef CONFIG_TCP_MD5SIG
2520         /* Clean up the MD5 key list, if any */
2521         if (tp->md5sig_info) {
2522                 struct tcp_md5sig_info *md5sig;
2523
2524                 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2525                 tcp_clear_md5_list(sk);
2526                 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2527                 rcu_assign_pointer(tp->md5sig_info, NULL);
2528         }
2529 #endif
2530         tcp_ao_destroy_sock(sk, false);
2531
2532         /* Clean up a referenced TCP bind bucket. */
2533         if (inet_csk(sk)->icsk_bind_hash)
2534                 inet_put_port(sk);
2535
2536         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2537
2538         /* If socket is aborted during connect operation */
2539         tcp_free_fastopen_req(tp);
2540         tcp_fastopen_destroy_cipher(sk);
2541         tcp_saved_syn_free(tp);
2542
2543         sk_sockets_allocated_dec(sk);
2544 }
2545 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2546
2547 #ifdef CONFIG_PROC_FS
2548 /* Proc filesystem TCP sock list dumping. */
2549
2550 static unsigned short seq_file_family(const struct seq_file *seq);
2551
2552 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2553 {
2554         unsigned short family = seq_file_family(seq);
2555
2556         /* AF_UNSPEC is used as a match all */
2557         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2558                 net_eq(sock_net(sk), seq_file_net(seq)));
2559 }
2560
2561 /* Find a non-empty bucket (starting from st->bucket)
2562  * and return the first sk from it.
2563  */
2564 static void *listening_get_first(struct seq_file *seq)
2565 {
2566         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2567         struct tcp_iter_state *st = seq->private;
2568
2569         st->offset = 0;
2570         for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2571                 struct inet_listen_hashbucket *ilb2;
2572                 struct hlist_nulls_node *node;
2573                 struct sock *sk;
2574
2575                 ilb2 = &hinfo->lhash2[st->bucket];
2576                 if (hlist_nulls_empty(&ilb2->nulls_head))
2577                         continue;
2578
2579                 spin_lock(&ilb2->lock);
2580                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2581                         if (seq_sk_match(seq, sk))
2582                                 return sk;
2583                 }
2584                 spin_unlock(&ilb2->lock);
2585         }
2586
2587         return NULL;
2588 }
2589
2590 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2591  * If "cur" is the last one in st->bucket,
2592  * call listening_get_first() to return the first sk of the next
2593  * non-empty bucket.
2594  */
2595 static void *listening_get_next(struct seq_file *seq, void *cur)
2596 {
2597         struct tcp_iter_state *st = seq->private;
2598         struct inet_listen_hashbucket *ilb2;
2599         struct hlist_nulls_node *node;
2600         struct inet_hashinfo *hinfo;
2601         struct sock *sk = cur;
2602
2603         ++st->num;
2604         ++st->offset;
2605
2606         sk = sk_nulls_next(sk);
2607         sk_nulls_for_each_from(sk, node) {
2608                 if (seq_sk_match(seq, sk))
2609                         return sk;
2610         }
2611
2612         hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2613         ilb2 = &hinfo->lhash2[st->bucket];
2614         spin_unlock(&ilb2->lock);
2615         ++st->bucket;
2616         return listening_get_first(seq);
2617 }
2618
2619 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2620 {
2621         struct tcp_iter_state *st = seq->private;
2622         void *rc;
2623
2624         st->bucket = 0;
2625         st->offset = 0;
2626         rc = listening_get_first(seq);
2627
2628         while (rc && *pos) {
2629                 rc = listening_get_next(seq, rc);
2630                 --*pos;
2631         }
2632         return rc;
2633 }
2634
2635 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2636                                 const struct tcp_iter_state *st)
2637 {
2638         return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2639 }
2640
2641 /*
2642  * Get first established socket starting from bucket given in st->bucket.
2643  * If st->bucket is zero, the very first socket in the hash is returned.
2644  */
2645 static void *established_get_first(struct seq_file *seq)
2646 {
2647         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2648         struct tcp_iter_state *st = seq->private;
2649
2650         st->offset = 0;
2651         for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2652                 struct sock *sk;
2653                 struct hlist_nulls_node *node;
2654                 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2655
2656                 cond_resched();
2657
2658                 /* Lockless fast path for the common case of empty buckets */
2659                 if (empty_bucket(hinfo, st))
2660                         continue;
2661
2662                 spin_lock_bh(lock);
2663                 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2664                         if (seq_sk_match(seq, sk))
2665                                 return sk;
2666                 }
2667                 spin_unlock_bh(lock);
2668         }
2669
2670         return NULL;
2671 }
2672
2673 static void *established_get_next(struct seq_file *seq, void *cur)
2674 {
2675         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2676         struct tcp_iter_state *st = seq->private;
2677         struct hlist_nulls_node *node;
2678         struct sock *sk = cur;
2679
2680         ++st->num;
2681         ++st->offset;
2682
2683         sk = sk_nulls_next(sk);
2684
2685         sk_nulls_for_each_from(sk, node) {
2686                 if (seq_sk_match(seq, sk))
2687                         return sk;
2688         }
2689
2690         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2691         ++st->bucket;
2692         return established_get_first(seq);
2693 }
2694
2695 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2696 {
2697         struct tcp_iter_state *st = seq->private;
2698         void *rc;
2699
2700         st->bucket = 0;
2701         rc = established_get_first(seq);
2702
2703         while (rc && pos) {
2704                 rc = established_get_next(seq, rc);
2705                 --pos;
2706         }
2707         return rc;
2708 }
2709
2710 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2711 {
2712         void *rc;
2713         struct tcp_iter_state *st = seq->private;
2714
2715         st->state = TCP_SEQ_STATE_LISTENING;
2716         rc        = listening_get_idx(seq, &pos);
2717
2718         if (!rc) {
2719                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2720                 rc        = established_get_idx(seq, pos);
2721         }
2722
2723         return rc;
2724 }
2725
2726 static void *tcp_seek_last_pos(struct seq_file *seq)
2727 {
2728         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2729         struct tcp_iter_state *st = seq->private;
2730         int bucket = st->bucket;
2731         int offset = st->offset;
2732         int orig_num = st->num;
2733         void *rc = NULL;
2734
2735         switch (st->state) {
2736         case TCP_SEQ_STATE_LISTENING:
2737                 if (st->bucket > hinfo->lhash2_mask)
2738                         break;
2739                 rc = listening_get_first(seq);
2740                 while (offset-- && rc && bucket == st->bucket)
2741                         rc = listening_get_next(seq, rc);
2742                 if (rc)
2743                         break;
2744                 st->bucket = 0;
2745                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2746                 fallthrough;
2747         case TCP_SEQ_STATE_ESTABLISHED:
2748                 if (st->bucket > hinfo->ehash_mask)
2749                         break;
2750                 rc = established_get_first(seq);
2751                 while (offset-- && rc && bucket == st->bucket)
2752                         rc = established_get_next(seq, rc);
2753         }
2754
2755         st->num = orig_num;
2756
2757         return rc;
2758 }
2759
2760 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2761 {
2762         struct tcp_iter_state *st = seq->private;
2763         void *rc;
2764
2765         if (*pos && *pos == st->last_pos) {
2766                 rc = tcp_seek_last_pos(seq);
2767                 if (rc)
2768                         goto out;
2769         }
2770
2771         st->state = TCP_SEQ_STATE_LISTENING;
2772         st->num = 0;
2773         st->bucket = 0;
2774         st->offset = 0;
2775         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2776
2777 out:
2778         st->last_pos = *pos;
2779         return rc;
2780 }
2781 EXPORT_SYMBOL(tcp_seq_start);
2782
2783 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2784 {
2785         struct tcp_iter_state *st = seq->private;
2786         void *rc = NULL;
2787
2788         if (v == SEQ_START_TOKEN) {
2789                 rc = tcp_get_idx(seq, 0);
2790                 goto out;
2791         }
2792
2793         switch (st->state) {
2794         case TCP_SEQ_STATE_LISTENING:
2795                 rc = listening_get_next(seq, v);
2796                 if (!rc) {
2797                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2798                         st->bucket = 0;
2799                         st->offset = 0;
2800                         rc        = established_get_first(seq);
2801                 }
2802                 break;
2803         case TCP_SEQ_STATE_ESTABLISHED:
2804                 rc = established_get_next(seq, v);
2805                 break;
2806         }
2807 out:
2808         ++*pos;
2809         st->last_pos = *pos;
2810         return rc;
2811 }
2812 EXPORT_SYMBOL(tcp_seq_next);
2813
2814 void tcp_seq_stop(struct seq_file *seq, void *v)
2815 {
2816         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2817         struct tcp_iter_state *st = seq->private;
2818
2819         switch (st->state) {
2820         case TCP_SEQ_STATE_LISTENING:
2821                 if (v != SEQ_START_TOKEN)
2822                         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2823                 break;
2824         case TCP_SEQ_STATE_ESTABLISHED:
2825                 if (v)
2826                         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2827                 break;
2828         }
2829 }
2830 EXPORT_SYMBOL(tcp_seq_stop);
2831
2832 static void get_openreq4(const struct request_sock *req,
2833                          struct seq_file *f, int i)
2834 {
2835         const struct inet_request_sock *ireq = inet_rsk(req);
2836         long delta = req->rsk_timer.expires - jiffies;
2837
2838         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2839                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2840                 i,
2841                 ireq->ir_loc_addr,
2842                 ireq->ir_num,
2843                 ireq->ir_rmt_addr,
2844                 ntohs(ireq->ir_rmt_port),
2845                 TCP_SYN_RECV,
2846                 0, 0, /* could print option size, but that is af dependent. */
2847                 1,    /* timers active (only the expire timer) */
2848                 jiffies_delta_to_clock_t(delta),
2849                 req->num_timeout,
2850                 from_kuid_munged(seq_user_ns(f),
2851                                  sock_i_uid(req->rsk_listener)),
2852                 0,  /* non standard timer */
2853                 0, /* open_requests have no inode */
2854                 0,
2855                 req);
2856 }
2857
2858 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2859 {
2860         int timer_active;
2861         unsigned long timer_expires;
2862         const struct tcp_sock *tp = tcp_sk(sk);
2863         const struct inet_connection_sock *icsk = inet_csk(sk);
2864         const struct inet_sock *inet = inet_sk(sk);
2865         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2866         __be32 dest = inet->inet_daddr;
2867         __be32 src = inet->inet_rcv_saddr;
2868         __u16 destp = ntohs(inet->inet_dport);
2869         __u16 srcp = ntohs(inet->inet_sport);
2870         int rx_queue;
2871         int state;
2872
2873         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2874             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2875             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2876                 timer_active    = 1;
2877                 timer_expires   = icsk->icsk_timeout;
2878         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2879                 timer_active    = 4;
2880                 timer_expires   = icsk->icsk_timeout;
2881         } else if (timer_pending(&sk->sk_timer)) {
2882                 timer_active    = 2;
2883                 timer_expires   = sk->sk_timer.expires;
2884         } else {
2885                 timer_active    = 0;
2886                 timer_expires = jiffies;
2887         }
2888
2889         state = inet_sk_state_load(sk);
2890         if (state == TCP_LISTEN)
2891                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2892         else
2893                 /* Because we don't lock the socket,
2894                  * we might find a transient negative value.
2895                  */
2896                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2897                                       READ_ONCE(tp->copied_seq), 0);
2898
2899         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2900                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2901                 i, src, srcp, dest, destp, state,
2902                 READ_ONCE(tp->write_seq) - tp->snd_una,
2903                 rx_queue,
2904                 timer_active,
2905                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2906                 icsk->icsk_retransmits,
2907                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2908                 icsk->icsk_probes_out,
2909                 sock_i_ino(sk),
2910                 refcount_read(&sk->sk_refcnt), sk,
2911                 jiffies_to_clock_t(icsk->icsk_rto),
2912                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2913                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2914                 tcp_snd_cwnd(tp),
2915                 state == TCP_LISTEN ?
2916                     fastopenq->max_qlen :
2917                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2918 }
2919
2920 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2921                                struct seq_file *f, int i)
2922 {
2923         long delta = tw->tw_timer.expires - jiffies;
2924         __be32 dest, src;
2925         __u16 destp, srcp;
2926
2927         dest  = tw->tw_daddr;
2928         src   = tw->tw_rcv_saddr;
2929         destp = ntohs(tw->tw_dport);
2930         srcp  = ntohs(tw->tw_sport);
2931
2932         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2933                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2934                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2935                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2936                 refcount_read(&tw->tw_refcnt), tw);
2937 }
2938
2939 #define TMPSZ 150
2940
2941 static int tcp4_seq_show(struct seq_file *seq, void *v)
2942 {
2943         struct tcp_iter_state *st;
2944         struct sock *sk = v;
2945
2946         seq_setwidth(seq, TMPSZ - 1);
2947         if (v == SEQ_START_TOKEN) {
2948                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2949                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2950                            "inode");
2951                 goto out;
2952         }
2953         st = seq->private;
2954
2955         if (sk->sk_state == TCP_TIME_WAIT)
2956                 get_timewait4_sock(v, seq, st->num);
2957         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2958                 get_openreq4(v, seq, st->num);
2959         else
2960                 get_tcp4_sock(v, seq, st->num);
2961 out:
2962         seq_pad(seq, '\n');
2963         return 0;
2964 }
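
/* Reading the output: local_address and rem_address are the raw __be32
 * address printed with %08X followed by the port in host byte order, so on
 * a little-endian box 127.0.0.1:8080 shows up as "0100007F:1F90", and the
 * st column is the socket state in hex (0A == TCP_LISTEN).  The example
 * values are illustrative only.
 */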
2965
2966 #ifdef CONFIG_BPF_SYSCALL
2967 struct bpf_tcp_iter_state {
2968         struct tcp_iter_state state;
2969         unsigned int cur_sk;
2970         unsigned int end_sk;
2971         unsigned int max_sk;
2972         struct sock **batch;
2973         bool st_bucket_done;
2974 };
2975
2976 struct bpf_iter__tcp {
2977         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2978         __bpf_md_ptr(struct sock_common *, sk_common);
2979         uid_t uid __aligned(8);
2980 };
2981
2982 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2983                              struct sock_common *sk_common, uid_t uid)
2984 {
2985         struct bpf_iter__tcp ctx;
2986
2987         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2988         ctx.meta = meta;
2989         ctx.sk_common = sk_common;
2990         ctx.uid = uid;
2991         return bpf_iter_run_prog(prog, &ctx);
2992 }
2993
2994 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2995 {
2996         while (iter->cur_sk < iter->end_sk)
2997                 sock_gen_put(iter->batch[iter->cur_sk++]);
2998 }
2999
3000 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3001                                       unsigned int new_batch_sz)
3002 {
3003         struct sock **new_batch;
3004
3005         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3006                              GFP_USER | __GFP_NOWARN);
3007         if (!new_batch)
3008                 return -ENOMEM;
3009
3010         bpf_iter_tcp_put_batch(iter);
3011         kvfree(iter->batch);
3012         iter->batch = new_batch;
3013         iter->max_sk = new_batch_sz;
3014
3015         return 0;
3016 }
3017
3018 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3019                                                  struct sock *start_sk)
3020 {
3021         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3022         struct bpf_tcp_iter_state *iter = seq->private;
3023         struct tcp_iter_state *st = &iter->state;
3024         struct hlist_nulls_node *node;
3025         unsigned int expected = 1;
3026         struct sock *sk;
3027
3028         sock_hold(start_sk);
3029         iter->batch[iter->end_sk++] = start_sk;
3030
3031         sk = sk_nulls_next(start_sk);
3032         sk_nulls_for_each_from(sk, node) {
3033                 if (seq_sk_match(seq, sk)) {
3034                         if (iter->end_sk < iter->max_sk) {
3035                                 sock_hold(sk);
3036                                 iter->batch[iter->end_sk++] = sk;
3037                         }
3038                         expected++;
3039                 }
3040         }
3041         spin_unlock(&hinfo->lhash2[st->bucket].lock);
3042
3043         return expected;
3044 }
3045
3046 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3047                                                    struct sock *start_sk)
3048 {
3049         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3050         struct bpf_tcp_iter_state *iter = seq->private;
3051         struct tcp_iter_state *st = &iter->state;
3052         struct hlist_nulls_node *node;
3053         unsigned int expected = 1;
3054         struct sock *sk;
3055
3056         sock_hold(start_sk);
3057         iter->batch[iter->end_sk++] = start_sk;
3058
3059         sk = sk_nulls_next(start_sk);
3060         sk_nulls_for_each_from(sk, node) {
3061                 if (seq_sk_match(seq, sk)) {
3062                         if (iter->end_sk < iter->max_sk) {
3063                                 sock_hold(sk);
3064                                 iter->batch[iter->end_sk++] = sk;
3065                         }
3066                         expected++;
3067                 }
3068         }
3069         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3070
3071         return expected;
3072 }
3073
3074 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3075 {
3076         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3077         struct bpf_tcp_iter_state *iter = seq->private;
3078         struct tcp_iter_state *st = &iter->state;
3079         unsigned int expected;
3080         bool resized = false;
3081         struct sock *sk;
3082
3083         /* The st->bucket is done.  Advance directly to the next
3084          * bucket instead of letting tcp_seek_last_pos() skip sockets
3085          * one by one in the current bucket only to find out that it
3086          * has to advance to the next bucket anyway.
3087          */
3088         if (iter->st_bucket_done) {
3089                 st->offset = 0;
3090                 st->bucket++;
3091                 if (st->state == TCP_SEQ_STATE_LISTENING &&
3092                     st->bucket > hinfo->lhash2_mask) {
3093                         st->state = TCP_SEQ_STATE_ESTABLISHED;
3094                         st->bucket = 0;
3095                 }
3096         }
3097
3098 again:
3099         /* Get a new batch */
3100         iter->cur_sk = 0;
3101         iter->end_sk = 0;
3102         iter->st_bucket_done = false;
3103
3104         sk = tcp_seek_last_pos(seq);
3105         if (!sk)
3106                 return NULL; /* Done */
3107
3108         if (st->state == TCP_SEQ_STATE_LISTENING)
3109                 expected = bpf_iter_tcp_listening_batch(seq, sk);
3110         else
3111                 expected = bpf_iter_tcp_established_batch(seq, sk);
3112
3113         if (iter->end_sk == expected) {
3114                 iter->st_bucket_done = true;
3115                 return sk;
3116         }
3117
3118         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3119                 resized = true;
3120                 goto again;
3121         }
3122
3123         return sk;
3124 }
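
/* Sizing illustration: with the initial batch of INIT_BATCH_SZ (16) entries
 * and a bucket holding 24 matching sockets, the first walk stores 16, counts
 * expected == 24, grows the batch to 24 * 3 / 2 == 36 entries and walks the
 * bucket again; if the second walk still overflows (the bucket grew in the
 * meantime), the partial batch is returned as is with st_bucket_done left
 * false.  The socket counts are illustrative only.
 */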
3125
3126 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3127 {
3128         /* bpf iter does not support lseek, so it always
3129          * continues from where it was stop()-ped.
3130          */
3131         if (*pos)
3132                 return bpf_iter_tcp_batch(seq);
3133
3134         return SEQ_START_TOKEN;
3135 }
3136
3137 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3138 {
3139         struct bpf_tcp_iter_state *iter = seq->private;
3140         struct tcp_iter_state *st = &iter->state;
3141         struct sock *sk;
3142
3143         /* Whenever seq_next() is called, the sk at iter->cur_sk has
3144          * already been through seq_show(), so advance to the next sk
3145          * in the batch.
3146          */
3147         if (iter->cur_sk < iter->end_sk) {
3148                 /* Keep st->num consistent in tcp_iter_state.
3149                  * bpf_iter_tcp does not use st->num;
3150                  * meta.seq_num is used instead.
3151                  */
3152                 st->num++;
3153                 /* Move st->offset to the next sk in the bucket so that
3154                  * a future start() will resume at st->offset in
3155                  * st->bucket.  See tcp_seek_last_pos().
3156                  */
3157                 st->offset++;
3158                 sock_gen_put(iter->batch[iter->cur_sk++]);
3159         }
3160
3161         if (iter->cur_sk < iter->end_sk)
3162                 sk = iter->batch[iter->cur_sk];
3163         else
3164                 sk = bpf_iter_tcp_batch(seq);
3165
3166         ++*pos;
3167         /* Keep st->last_pos consistent in tcp_iter_state.
3168          * bpf iter does not do lseek, so st->last_pos always equals *pos.
3169          */
3170         st->last_pos = *pos;
3171         return sk;
3172 }
3173
3174 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3175 {
3176         struct bpf_iter_meta meta;
3177         struct bpf_prog *prog;
3178         struct sock *sk = v;
3179         uid_t uid;
3180         int ret;
3181
3182         if (v == SEQ_START_TOKEN)
3183                 return 0;
3184
3185         if (sk_fullsock(sk))
3186                 lock_sock(sk);
3187
3188         if (unlikely(sk_unhashed(sk))) {
3189                 ret = SEQ_SKIP;
3190                 goto unlock;
3191         }
3192
3193         if (sk->sk_state == TCP_TIME_WAIT) {
3194                 uid = 0;
3195         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3196                 const struct request_sock *req = v;
3197
3198                 uid = from_kuid_munged(seq_user_ns(seq),
3199                                        sock_i_uid(req->rsk_listener));
3200         } else {
3201                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3202         }
3203
3204         meta.seq = seq;
3205         prog = bpf_iter_get_info(&meta, false);
3206         ret = tcp_prog_seq_show(prog, &meta, v, uid);
3207
3208 unlock:
3209         if (sk_fullsock(sk))
3210                 release_sock(sk);
3211         return ret;
3212
3213 }
3214
3215 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3216 {
3217         struct bpf_tcp_iter_state *iter = seq->private;
3218         struct bpf_iter_meta meta;
3219         struct bpf_prog *prog;
3220
3221         if (!v) {
3222                 meta.seq = seq;
3223                 prog = bpf_iter_get_info(&meta, true);
3224                 if (prog)
3225                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
3226         }
3227
3228         if (iter->cur_sk < iter->end_sk) {
3229                 bpf_iter_tcp_put_batch(iter);
3230                 iter->st_bucket_done = false;
3231         }
3232 }
3233
3234 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3235         .show           = bpf_iter_tcp_seq_show,
3236         .start          = bpf_iter_tcp_seq_start,
3237         .next           = bpf_iter_tcp_seq_next,
3238         .stop           = bpf_iter_tcp_seq_stop,
3239 };
3240 #endif
3241 static unsigned short seq_file_family(const struct seq_file *seq)
3242 {
3243         const struct tcp_seq_afinfo *afinfo;
3244
3245 #ifdef CONFIG_BPF_SYSCALL
3246         /* Iterated from bpf_iter.  Let the bpf prog do the filtering instead. */
3247         if (seq->op == &bpf_iter_tcp_seq_ops)
3248                 return AF_UNSPEC;
3249 #endif
3250
3251         /* Iterated from proc fs */
3252         afinfo = pde_data(file_inode(seq->file));
3253         return afinfo->family;
3254 }
3255
3256 static const struct seq_operations tcp4_seq_ops = {
3257         .show           = tcp4_seq_show,
3258         .start          = tcp_seq_start,
3259         .next           = tcp_seq_next,
3260         .stop           = tcp_seq_stop,
3261 };
3262
3263 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3264         .family         = AF_INET,
3265 };
3266
3267 static int __net_init tcp4_proc_init_net(struct net *net)
3268 {
3269         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3270                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3271                 return -ENOMEM;
3272         return 0;
3273 }
3274
3275 static void __net_exit tcp4_proc_exit_net(struct net *net)
3276 {
3277         remove_proc_entry("tcp", net->proc_net);
3278 }
3279
3280 static struct pernet_operations tcp4_net_ops = {
3281         .init = tcp4_proc_init_net,
3282         .exit = tcp4_proc_exit_net,
3283 };
3284
3285 int __init tcp4_proc_init(void)
3286 {
3287         return register_pernet_subsys(&tcp4_net_ops);
3288 }
3289
3290 void tcp4_proc_exit(void)
3291 {
3292         unregister_pernet_subsys(&tcp4_net_ops);
3293 }
3294 #endif /* CONFIG_PROC_FS */
3295
3296 /* @wake is one when sk_stream_write_space() calls us.
3297  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3298  * This mimics the strategy used in sock_def_write_space().
3299  */
3300 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3301 {
3302         const struct tcp_sock *tp = tcp_sk(sk);
3303         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3304                             READ_ONCE(tp->snd_nxt);
3305
3306         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3307 }
3308 EXPORT_SYMBOL(tcp_stream_memory_free);
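
/* Worked example with illustrative numbers: with tcp_notsent_lowat set to
 * 128 KB and 70 KB still unsent, a plain check (wake == 0) sees
 * 70 KB < 128 KB and reports the stream writable, while the wakeup path
 * (wake == 1) compares 140 KB against 128 KB and stays quiet until the
 * unsent backlog drops below 64 KB.
 */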
3309
3310 struct proto tcp_prot = {
3311         .name                   = "TCP",
3312         .owner                  = THIS_MODULE,
3313         .close                  = tcp_close,
3314         .pre_connect            = tcp_v4_pre_connect,
3315         .connect                = tcp_v4_connect,
3316         .disconnect             = tcp_disconnect,
3317         .accept                 = inet_csk_accept,
3318         .ioctl                  = tcp_ioctl,
3319         .init                   = tcp_v4_init_sock,
3320         .destroy                = tcp_v4_destroy_sock,
3321         .shutdown               = tcp_shutdown,
3322         .setsockopt             = tcp_setsockopt,
3323         .getsockopt             = tcp_getsockopt,
3324         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3325         .keepalive              = tcp_set_keepalive,
3326         .recvmsg                = tcp_recvmsg,
3327         .sendmsg                = tcp_sendmsg,
3328         .splice_eof             = tcp_splice_eof,
3329         .backlog_rcv            = tcp_v4_do_rcv,
3330         .release_cb             = tcp_release_cb,
3331         .hash                   = inet_hash,
3332         .unhash                 = inet_unhash,
3333         .get_port               = inet_csk_get_port,
3334         .put_port               = inet_put_port,
3335 #ifdef CONFIG_BPF_SYSCALL
3336         .psock_update_sk_prot   = tcp_bpf_update_proto,
3337 #endif
3338         .enter_memory_pressure  = tcp_enter_memory_pressure,
3339         .leave_memory_pressure  = tcp_leave_memory_pressure,
3340         .stream_memory_free     = tcp_stream_memory_free,
3341         .sockets_allocated      = &tcp_sockets_allocated,
3342         .orphan_count           = &tcp_orphan_count,
3343
3344         .memory_allocated       = &tcp_memory_allocated,
3345         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3346
3347         .memory_pressure        = &tcp_memory_pressure,
3348         .sysctl_mem             = sysctl_tcp_mem,
3349         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3350         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3351         .max_header             = MAX_TCP_HEADER,
3352         .obj_size               = sizeof(struct tcp_sock),
3353         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3354         .twsk_prot              = &tcp_timewait_sock_ops,
3355         .rsk_prot               = &tcp_request_sock_ops,
3356         .h.hashinfo             = NULL,
3357         .no_autobind            = true,
3358         .diag_destroy           = tcp_abort,
3359 };
3360 EXPORT_SYMBOL(tcp_prot);
3361
3362 static void __net_exit tcp_sk_exit(struct net *net)
3363 {
3364         if (net->ipv4.tcp_congestion_control)
3365                 bpf_module_put(net->ipv4.tcp_congestion_control,
3366                                net->ipv4.tcp_congestion_control->owner);
3367 }
3368
3369 static void __net_init tcp_set_hashinfo(struct net *net)
3370 {
3371         struct inet_hashinfo *hinfo;
3372         unsigned int ehash_entries;
3373         struct net *old_net;
3374
3375         if (net_eq(net, &init_net))
3376                 goto fallback;
3377
3378         old_net = current->nsproxy->net_ns;
3379         ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3380         if (!ehash_entries)
3381                 goto fallback;
3382
3383         ehash_entries = roundup_pow_of_two(ehash_entries);
3384         hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3385         if (!hinfo) {
3386                 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3387                         "for a netns, falling back to the global one\n",
3388                         ehash_entries);
3389 fallback:
3390                 hinfo = &tcp_hashinfo;
3391                 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3392         }
3393
3394         net->ipv4.tcp_death_row.hashinfo = hinfo;
3395         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3396         net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3397 }
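
/* Usage sketch (assuming the per-netns knob is exposed as the usual
 * net.ipv4.tcp_child_ehash_entries sysctl): writing 16384 to it in the
 * parent namespace before creating a child netns gives the child its own
 * 16384-entry ehash instead of the shared tcp_hashinfo, and caps its
 * TIME_WAIT buckets at 8192 (ehash_entries / 2).
 */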
3398
3399 static int __net_init tcp_sk_init(struct net *net)
3400 {
3401         net->ipv4.sysctl_tcp_ecn = 2;
3402         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3403
3404         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3405         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3406         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3407         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3408         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3409
3410         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3411         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3412         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3413
3414         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3415         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3416         net->ipv4.sysctl_tcp_syncookies = 1;
3417         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3418         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3419         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3420         net->ipv4.sysctl_tcp_orphan_retries = 0;
3421         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3422         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3423         net->ipv4.sysctl_tcp_tw_reuse = 2;
3424         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3425
3426         refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3427         tcp_set_hashinfo(net);
3428
3429         net->ipv4.sysctl_tcp_sack = 1;
3430         net->ipv4.sysctl_tcp_window_scaling = 1;
3431         net->ipv4.sysctl_tcp_timestamps = 1;
3432         net->ipv4.sysctl_tcp_early_retrans = 3;
3433         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3434         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3435         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3436         net->ipv4.sysctl_tcp_max_reordering = 300;
3437         net->ipv4.sysctl_tcp_dsack = 1;
3438         net->ipv4.sysctl_tcp_app_win = 31;
3439         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3440         net->ipv4.sysctl_tcp_frto = 2;
3441         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3442         /* This limits the percentage of the congestion window which we
3443          * will allow a single TSO frame to consume.  Building TSO frames
3444          * which are too large can cause TCP streams to be bursty.
3445          */
3446         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3447         /* Default TSQ limit of 16 TSO segments */
3448         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3449
3450         /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3451         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3452
3453         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3454         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3455         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3456         net->ipv4.sysctl_tcp_autocorking = 1;
3457         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3458         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3459         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3460         if (net != &init_net) {
3461                 memcpy(net->ipv4.sysctl_tcp_rmem,
3462                        init_net.ipv4.sysctl_tcp_rmem,
3463                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3464                 memcpy(net->ipv4.sysctl_tcp_wmem,
3465                        init_net.ipv4.sysctl_tcp_wmem,
3466                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3467         }
3468         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3469         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3470         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3471         net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3472         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3473         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3474         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3475
3476         /* Set default values for PLB */
3477         net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3478         net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3479         net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3480         net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3481         /* Default congestion threshold for PLB to mark a round as congested is 50% */
3482         net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3483
3484         /* Reno is always built in */
3485         if (!net_eq(net, &init_net) &&
3486             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3487                                init_net.ipv4.tcp_congestion_control->owner))
3488                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3489         else
3490                 net->ipv4.tcp_congestion_control = &tcp_reno;
3491
3492         net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3493         net->ipv4.sysctl_tcp_shrink_window = 0;
3494
3495         net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3496
3497         return 0;
3498 }
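
/* Most of the defaults above are exposed as per-netns sysctls under
 * net.ipv4 (names assume the standard sysctl layout): for example
 * "sysctl net.ipv4.tcp_syncookies" reports the 1 set here in a fresh
 * namespace, and "sysctl -w net.ipv4.tcp_notsent_lowat=131072" replaces
 * the UINT_MAX default.
 */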
3499
3500 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3501 {
3502         struct net *net;
3503
3504         tcp_twsk_purge(net_exit_list, AF_INET);
3505
3506         list_for_each_entry(net, net_exit_list, exit_list) {
3507                 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3508                 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3509                 tcp_fastopen_ctx_destroy(net);
3510         }
3511 }
3512
3513 static struct pernet_operations __net_initdata tcp_sk_ops = {
3514        .init       = tcp_sk_init,
3515        .exit       = tcp_sk_exit,
3516        .exit_batch = tcp_sk_exit_batch,
3517 };
3518
3519 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3520 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3521                      struct sock_common *sk_common, uid_t uid)
3522
3523 #define INIT_BATCH_SZ 16
3524
3525 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3526 {
3527         struct bpf_tcp_iter_state *iter = priv_data;
3528         int err;
3529
3530         err = bpf_iter_init_seq_net(priv_data, aux);
3531         if (err)
3532                 return err;
3533
3534         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3535         if (err) {
3536                 bpf_iter_fini_seq_net(priv_data);
3537                 return err;
3538         }
3539
3540         return 0;
3541 }
3542
3543 static void bpf_iter_fini_tcp(void *priv_data)
3544 {
3545         struct bpf_tcp_iter_state *iter = priv_data;
3546
3547         bpf_iter_fini_seq_net(priv_data);
3548         kvfree(iter->batch);
3549 }
3550
3551 static const struct bpf_iter_seq_info tcp_seq_info = {
3552         .seq_ops                = &bpf_iter_tcp_seq_ops,
3553         .init_seq_private       = bpf_iter_init_tcp,
3554         .fini_seq_private       = bpf_iter_fini_tcp,
3555         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3556 };
3557
3558 static const struct bpf_func_proto *
3559 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3560                             const struct bpf_prog *prog)
3561 {
3562         switch (func_id) {
3563         case BPF_FUNC_setsockopt:
3564                 return &bpf_sk_setsockopt_proto;
3565         case BPF_FUNC_getsockopt:
3566                 return &bpf_sk_getsockopt_proto;
3567         default:
3568                 return NULL;
3569         }
3570 }
3571
3572 static struct bpf_iter_reg tcp_reg_info = {
3573         .target                 = "tcp",
3574         .ctx_arg_info_size      = 1,
3575         .ctx_arg_info           = {
3576                 { offsetof(struct bpf_iter__tcp, sk_common),
3577                   PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3578         },
3579         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3580         .seq_info               = &tcp_seq_info,
3581 };
3582
3583 static void __init bpf_iter_register(void)
3584 {
3585         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3586         if (bpf_iter_reg_target(&tcp_reg_info))
3587                 pr_warn("Warning: could not register bpf iterator tcp\n");
3588 }
3589
3590 #endif
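
/* Consumer sketch for the iterator registered above (BPF side, assuming the
 * usual libbpf conventions: vmlinux.h plus the libbpf helper headers; the
 * program and section names are illustrative, not part of this file):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *
 *		if (!skc)
 *			return 0;
 *		if (skc->skc_state != 10)	// 10 == TCP_LISTEN
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "port %u uid %u\n", skc->skc_num, ctx->uid);
 *		return 0;
 *	}
 *
 * Reading the pinned iterator link then yields one line per socket the
 * program chose to print, with the bpf prog doing the filtering that
 * seq_file_family() declines to do when it returns AF_UNSPEC.
 */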
3591
3592 void __init tcp_v4_init(void)
3593 {
3594         int cpu, res;
3595
3596         for_each_possible_cpu(cpu) {
3597                 struct sock *sk;
3598
3599                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3600                                            IPPROTO_TCP, &init_net);
3601                 if (res)
3602                         panic("Failed to create the TCP control socket.\n");
3603                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3604
3605                 /* Please enforce IP_DF and IPID==0 for RST and
3606                  * ACK packets sent in SYN-RECV and TIME-WAIT states.
3607                  */
3608                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3609
3610                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3611         }
3612         if (register_pernet_subsys(&tcp_sk_ops))
3613                 panic("Failed to create the TCP control socket.\n");
3614
3615 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3616         bpf_iter_register();
3617 #endif
3618 }