// SPDX-License-Identifier: GPL-2.0-or-later
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *		Implementation of the Transmission Control Protocol(TCP).
 *		IPv4 specific functions
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *		See tcp.c for author information
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash,
 *				half is devoted to TIME_WAIT sockets
 *				and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: ip options weren't passed to
 *				the TCP layer, missed a check for an
 *	Andi Kleen :		Implemented fast path mtu discovery.
 *				Fixed many serious bugs in the
 *				request_sock handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan :		Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov :	Transparent proxy revived after year
 *	Andi Kleen :		Fix new listen.
 *	Andi Kleen :		Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
#define pr_fmt(fmt) "TCP: " fmt
#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/inet_hashtables.h>
#include <net/transp_v6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>
#include <crypto/hash.h>
#include <linux/scatterlist.h>
#include <trace/events/tcp.h>
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      tcp_hdr(skb)->source);
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	/* Still does not detect *everything* that goes through
	 * lo, since we require a loopback src or dst address
	 * or direct binding to 'lo' interface.
	bool loopback = false;
	if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
#if IS_ENABLED(CONFIG_IPV6)
	if (tw->tw_family == AF_INET6) {
		if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
		    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
		    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
		    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
	if (ipv4_is_loopback(tw->tw_daddr) ||
	    ipv4_is_loopback(tw->tw_rcv_saddr))
	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
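
/* A worked example of the sequence bump above (illustrative note, not part
 * of the original source): tw_snd_nxt + 65535 + 2 moves the new
 * connection's initial write_seq past anything the previous incarnation
 * could still have in flight, since 65535 is the largest unscaled receive
 * window. E.g. with tw_snd_nxt == 1000 the new write_seq becomes
 * 1000 + 65535 + 2 = 66537; the extra 2 plausibly leaves headroom for the
 * SYN and FIN, which each consume one unit of sequence space.
 */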
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent the BPF program called below from accessing bytes that are
	 * outside the bound specified by the user in addr_len.
	if (addr_len < sizeof(struct sockaddr_in))
	sock_owned_by_me(sk);
	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
	if (addr_len < sizeof(struct sockaddr_in))
	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;
	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		nexthop = inet_opt->opt.faddr;
	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
	if (!inet_opt || !inet_opt->opt.srr)
	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);
	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);
	inet_csk(sk)->icsk_ext_hdr_len = 0;
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	if (likely(!tp->repair)) {
		WRITE_ONCE(tp->write_seq,
			   secure_tcp_seq(inet->inet_saddr,
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
	inet->inet_id = prandom_u32();
	if (tcp_fastopen_defer_connect(sk, &err))
	err = tcp_connect(sk);
	 * This unhashes the socket and releases the local port,
	tcp_set_state(sk, TCP_CLOSE);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
EXPORT_SYMBOL(tcp_v4_connect);
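
/* For orientation (an illustrative sketch, not part of the original file):
 * tcp_v4_connect() is what ultimately runs when userspace connect()s an
 * AF_INET/SOCK_STREAM socket, e.g.:
 *
 *	#include <arpa/inet.h>
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),	// hypothetical peer port
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * The addr_len and sin_family checks at the top of tcp_v4_connect() are
 * exactly what such a call must satisfy before a route is looked up and
 * the SYN is sent.
 */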
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if the socket was owned by the user
 * at the time tcp_v4_err() was called to handle the ICMP message.
void tcp_v4_mtu_reduced(struct sock *sk)
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	/* Something is about to go wrong... Remember the soft error
	 * in case this connection will not be able to recover.
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;
	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);
		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
static void do_redirect(struct sk_buff *skb, struct sock *sk)
	struct dst_entry *dst = __sk_dst_check(sk, 0);
		dst->ops->redirect(dst, sk, skb);
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);
	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
EXPORT_SYMBOL(tcp_req_err);
/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	if (sock_owned_by_user(sk))
	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		tcp_retransmit_timer(sk);
EXPORT_SYMBOL(tcp_ld_RTO_revert);
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
int tcp_v4_err(struct sk_buff *skb, u32 info)
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct net *net = dev_net(skb->dev);
	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	if (sk->sk_state == TCP_CLOSE)
	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
	case ICMP_PARAMETERPROB:
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
			 * they should go through unfragmented).
			if (sk->sk_state == TCP_LISTEN)
			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
	case ICMP_TIME_EXCEEDED:
	switch (sk->sk_state) {
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		if (fastopen && !fastopen->sk)
		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
		if (!sock_owned_by_user(sk)) {
			sk->sk_err_soft = err;
		/* If we've already connected we will keep trying
		 * until we time out, or the user gives up.
		 * RFC 1122 4.2.3.9 allows us to consider as hard errors
		 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
		 * but it is obsoleted by PMTU discovery).
		 * Note that on the modern internet, where routing is unreliable
		 * and broken firewalls sit in every dark corner sending random
		 * errors ordered by their masters, even these two messages finally lose
		 * their original sense (even Linux sends invalid PORT_UNREACHs)
		 * Now we are in compliance with RFCs.
	if (!sock_owned_by_user(sk) && inet->recverr) {
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
	struct tcphdr *th = tcp_hdr(skb);
	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
	const struct inet_sock *inet = inet_sk(sk);
	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
EXPORT_SYMBOL(tcp_v4_send_check);
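
/* A short aside (illustrative, not from the original source): the partial
 * checksum seeded above via tcp_v4_check(..., 0) covers the classic IPv4
 * pseudo-header of RFC 793:
 *
 *	+--------+--------+--------+--------+
 *	|          source address           |
 *	+--------+--------+--------+--------+
 *	|        destination address        |
 *	+--------+--------+--------+--------+
 *	|  zero  | proto  |    TCP length   |
 *	+--------+--------+--------+--------+
 *
 * csum_start/csum_offset then tell CHECKSUM_PARTIAL-capable hardware (or
 * the software fallback) where to fold the final 16-bit checksum into the
 * TCP header.
 */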
 * This routine will send an RST to the other tcp.
 * Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
 * Answer: if a packet caused an RST, it is not for a socket
 *	   existing in our system; if it is matched to a socket,
 *	   it is just a duplicate segment or a bug in the other side's TCP.
 *	   So we build the reply based only on the parameters
 *	   that arrived with the segment.
 * Exception: precedence violation. We do not implement it in any case.
#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#define OPTION_BYTES sizeof(__be32)
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
	const struct tcphdr *th = tcp_hdr(skb);
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
	u64 transmit_time = 0;
	/* Never send a reset in response to a reset. */
	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
		rep.th.seq = th->ack_seq;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * We do not lose security here:
		 * the incoming packet is checked with the md5 hash of the key
		 * we find, and no RST is generated if the md5 hash doesn't match.
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_MD5SIG << 8) |
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;
		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here: if we choose to force
	 * the input interface, we will misroute in case of an asymmetric route.
		arg.bound_dev_if = sk->sk_bound_dev_if;
	trace_tcp_send_reset(sk, skb);
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
#ifdef CONFIG_TCP_MD5SIG
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
	const struct tcphdr *th = tcp_hdr(skb);
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.window  = htons(win);
#ifdef CONFIG_TCP_MD5SIG
		int offset = (tsecr) ? 3 : 0;
		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_MD5SIG << 8) |
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;
		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.bound_dev_if = oif;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
	const union tcp_md5_addr *addr;
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
	const struct inet_request_sock *ireq = inet_rsk(req);
	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    rcu_dereference(ireq->ireq_opt),
		err = net_xmit_eval(err);
 * IPv4 request_sock destructor.
static void tcp_v4_reqsk_destructor(struct request_sock *req)
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
#ifdef CONFIG_TCP_MD5SIG
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
	if (old->l3index == 0 && new->l3index)
	return old->prefixlen < new->prefixlen;
/* Find the Key structure for an address. */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	struct tcp_md5sig_key *best_match = NULL;
	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
		if (match && better_md5_match(best_match, key))
EXPORT_SYMBOL(__tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;
	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
		if (key->l3index != l3index)
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
	const union tcp_md5_addr *addr;
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;
	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care about
		 * key mismatches, since changing the MD5 key on live flows
		 * can lead to packet drops.
		data_race(memcpy(key->key, newkey, newkeylen));
		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch the new key->keylen value
		 * but the old key->key[]; this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		WRITE_ONCE(key->keylen, newkeylen);
	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
	struct tcp_md5sig_key *key;
	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
EXPORT_SYMBOL(tcp_md5_do_del);
static void tcp_clear_md5_list(struct sock *sk)
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;
	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	if (optlen < sizeof(cmd))
	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
	if (sin->sin_family != AF_INET)
	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;
		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		if (!dev || !l3index)
	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
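
/* Userspace view (an illustrative sketch, not part of the original file):
 * the parser above services setsockopt(TCP_MD5SIG/TCP_MD5SIG_EXT), e.g.:
 *
 *	#include <arpa/inet.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };	// hypothetical key
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key (tcp_md5_do_del() above); anything up
 * to TCP_MD5SIG_MAXKEYLEN adds or replaces one.
 */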
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);
	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	hp = tcp_get_md5sig_pool();
		goto clear_hash_noput;
	if (crypto_ahash_init(req))
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
	if (tcp_md5_hash_key(hp, key))
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
	tcp_put_md5sig_pool();
	tcp_put_md5sig_pool();
	memset(md5_hash, 0, 16);
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;
	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
		const struct iphdr *iph = ip_hdr(skb);
	hp = tcp_get_md5sig_pool();
		goto clear_hash_noput;
	if (crypto_ahash_init(req))
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
	if (tcp_md5_hash_key(hp, key))
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
	tcp_put_md5sig_pool();
	tcp_put_md5sig_pool();
	memset(md5_hash, 0, 16);
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);
	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req)
	tcp_v4_init_req(req, sk, skb);
	if (security_inet_conn_request(sk, skb, req))
	return inet_csk_route_req(sk, &fl->u.ip4, req);
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
	/* Never answer SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);
EXPORT_SYMBOL(tcp_v4_conn_request);
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	struct ip_options_rcu *inet_opt;
	if (sk_acceptq_is_full(sk))
	newsk = tcp_create_openreq_child(sk, req, skb);
	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();
	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
		dst = inet_csk_route_child_sock(sk, newsk, req);
	/* syncookie case : see end of cookie_v4_check() */
	sk_setup_caps(newsk, dst);
	tcp_ca_openreq_child(newsk, dst);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
	tcp_initialize_rcv_mss(newsk);
#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_gso_disable(newsk);
	if (__inet_inherit_port(sk, newsk) < 0)
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
		newinet->inet_opt = NULL;
		if (!req_unhash && found_dup_sk) {
			/* This code path should be executed only in
			 * the syncookie case
			bh_unlock_sock(newsk);
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);
		sk = cookie_v4_check(sk, skb);
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
	enum skb_drop_reason reason;
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;
		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));
		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
		tcp_rcv_established(sk, skb);
	reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (tcp_checksum_complete(skb))
	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
	sock_rps_save_rxhash(sk, skb);
	if (tcp_rcv_state_process(sk, skb)) {
	tcp_v4_send_reset(rsk, skb);
	kfree_skb_reason(skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_early_demux(struct sk_buff *skb)
	const struct iphdr *iph;
	const struct tcphdr *th;
	if (skb->pkt_type != PACKET_HOST)
	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
	if (th->doff < sizeof(struct tcphdr) / 4)
	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
				dst = dst_check(dst, 0);
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
	u32 limit, tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	if (unlikely(tcp_checksum_complete(skb))) {
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	/* Attempt coalescing to last skb in backlog, even if we are
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;
	tail = sk->sk_backlog.tail;
	thtail = (struct tcphdr *)tail->data;
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	       TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
	__skb_pull(skb, hdrlen);
	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;
	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;
	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *	 at least for 'GRO' packets.
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
	__skb_push(skb, hdrlen);
	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few socket backlogs are likely to be non-empty concurrently.
	limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
EXPORT_SYMBOL(tcp_add_backlog);
int tcp_filter(struct sock *sk, struct sk_buff *skb)
	struct tcphdr *th = (struct tcphdr *)skb->data;
	return sk_filter_trim_cap(sk, skb, th->doff * 4);
EXPORT_SYMBOL(tcp_filter);
static void tcp_v4_restore_cb(struct sk_buff *skb)
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
	/* This is tricky: we move IPCB to its correct location into TCP_SKB_CB().
	 * barrier() makes sure the compiler won't play fool^Waliasing games.
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
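
/* Worked example for the end_seq arithmetic above (illustrative, not from
 * the original source): end_seq counts every unit that consumes sequence
 * space. For a segment with seq = 1000, 500 payload bytes (skb->len minus
 * th->doff * 4), FIN set and SYN clear:
 *
 *	end_seq = 1000 + 0 (syn) + 1 (fin) + 500 = 1501
 *
 * so a pure ACK (no payload, no SYN/FIN) has end_seq == seq.
 */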
int tcp_v4_rcv(struct sk_buff *skb)
	struct net *net = dev_net(skb->dev);
	enum skb_drop_reason drop_reason;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
	th = (const struct tcphdr *)skb->data;
	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
	if (!pskb_may_pull(skb, th->doff * 4))
	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
	th = (const struct tcphdr *)skb->data;
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (sk->sk_state == TCP_TIME_WAIT)
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
			drop_reason = tcp_inbound_md5_hash(sk, skb,
						   &iph->saddr, &iph->daddr,
						   AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_add(sk, skb);
		if (tcp_checksum_complete(skb)) {
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
				inet_csk_reqsk_queue_drop_and_put(sk, req);
			/* reuseport_migrate_sock() has already held one sk_refcnt
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
			/* Another cpu got exclusive access to req
			 * and created a full blown socket.
			 * Try to feed this packet to this socket
			 * instead of discarding it.
			tcp_v4_restore_cb(skb);
			goto discard_and_relse;
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto discard_and_relse;
	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
					   &iph->daddr, AF_INET, dif, sdif);
		goto discard_and_relse;
	if (tcp_filter(sk, skb)) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	tcp_v4_fill_cb(skb, iph, th);
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	sk_incoming_cpu_update(sk);
	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
		if (tcp_add_backlog(sk, skb, &drop_reason))
			goto discard_and_relse;
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
	tcp_v4_fill_cb(skb, iph, th);
	if (tcp_checksum_complete(skb)) {
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
		tcp_v4_send_reset(NULL, skb);
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard frame. */
	kfree_skb_reason(skb, drop_reason);
	sk_drops_add(sk, skb);
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
	tcp_v4_fill_cb(skb, iph, th);
	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							iph->saddr, th->source,
							iph->daddr, th->dest,
			inet_twsk_deschedule_put(inet_twsk(sk));
			tcp_v4_restore_cb(skb);
		tcp_v4_timewait_ack(sk, skb);
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
	case TCP_TW_SUCCESS:;
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	 = sizeof(struct tcp_timewait_sock),
	.twsk_unique	 = tcp_twsk_unique,
	.twsk_destructor = tcp_twsk_destructor,
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
	struct dst_entry *dst = skb_dst(skb);
	if (dst && dst_hold_safe(dst)) {
		rcu_assign_pointer(sk->sk_rx_dst, dst);
		sk->sk_rx_dst_ifindex = skb->skb_iif;
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.mtu_reduced	   = tcp_v4_mtu_reduced,
EXPORT_SYMBOL(ipv4_specific);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
static int tcp_v4_init_sock(struct sock *sk)
	struct inet_connection_sock *icsk = inet_csk(sk);
	icsk->icsk_af_ops = &ipv4_specific;
#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
void tcp_v4_destroy_sock(struct sock *sk)
	struct tcp_sock *tp = tcp_sk(sk);
	trace_tcp_destroy_sock(sk);
	tcp_clear_xmit_timers(sk);
	tcp_cleanup_congestion_control(sk);
	tcp_cleanup_ulp(sk);
	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);
	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);
	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);
#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
	/* If the socket is aborted during a connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);
	sk_sockets_allocated_dec(sk);
EXPORT_SYMBOL(tcp_v4_destroy_sock);
2269 #ifdef CONFIG_PROC_FS
2270 /* Proc filesystem TCP sock list dumping. */
2272 static unsigned short seq_file_family(const struct seq_file *seq);
2274 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2276 unsigned short family = seq_file_family(seq);
2278 /* AF_UNSPEC is used as a match all */
2279 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2280 net_eq(sock_net(sk), seq_file_net(seq)));
2283 /* Find a non empty bucket (starting from st->bucket)
2284 * and return the first sk from it.
2286 static void *listening_get_first(struct seq_file *seq)
2288 struct tcp_iter_state *st = seq->private;
2291 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2292 struct inet_listen_hashbucket *ilb2;
2293 struct hlist_nulls_node *node;
2296 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2297 if (hlist_nulls_empty(&ilb2->nulls_head))
2300 spin_lock(&ilb2->lock);
2301 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2302 if (seq_sk_match(seq, sk))
2305 spin_unlock(&ilb2->lock);
2311 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2312 * If "cur" is the last one in the st->bucket,
2313 * call listening_get_first() to return the first sk of the next
2316 static void *listening_get_next(struct seq_file *seq, void *cur)
2318 struct tcp_iter_state *st = seq->private;
2319 struct inet_listen_hashbucket *ilb2;
2320 struct hlist_nulls_node *node;
2321 struct sock *sk = cur;
2326 sk = sk_nulls_next(sk);
2327 sk_nulls_for_each_from(sk, node) {
2328 if (seq_sk_match(seq, sk))
2332 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2333 spin_unlock(&ilb2->lock);
2335 return listening_get_first(seq);
2338 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2340 struct tcp_iter_state *st = seq->private;
2345 rc = listening_get_first(seq);
2347 while (rc && *pos) {
2348 rc = listening_get_next(seq, rc);
2354 static inline bool empty_bucket(const struct tcp_iter_state *st)
2356 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;
		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock_bh(lock);
	}
	return NULL;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}
	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);
	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);
	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}
	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > tcp_hashinfo.lhash2_mask)
			break;
		rc = listening_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;
	return rc;
}

void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);

void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

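/* For reference, one illustrative line produced by the format above
 * (values invented for the example):
 *
 *	0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000 0 12345 1 0000000000000000 100 0 0 10 0
 *
 * That is a socket bound to 127.0.0.1:3306 (addresses printed via %08X
 * in host byte order, ports in hex) in state 0x0A (TCP_LISTEN), owned
 * by uid 1000, inode 12345; the trailing fields are the refcount, the
 * (hashed, %pK) socket pointer, rto, ato, the quick/pingpong bits, the
 * send congestion window, and the listen-queue/ssthresh field.
 */
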
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

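/* A minimal userspace consumer of this format might look like the
 * following sketch (illustrative only; error handling omitted; the
 * first fgets() merely skips the header row):
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned int laddr, lport, raddr, rport, state;
 *		char line[512];
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		fgets(line, sizeof(line), f);
 *		while (fgets(line, sizeof(line), f)) {
 *			if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
 *				   &laddr, &lport, &raddr, &rport,
 *				   &state) == 5)
 *				printf("%08X:%u -> %08X:%u st %02X\n",
 *				       laddr, lport, raddr, rport, state);
 *		}
 *		return 0;
 *	}
 */
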
#ifdef CONFIG_BPF_SYSCALL
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};

struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};

static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

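/* On the consumer side, a bpf_iter program receives this context.  A
 * hedged sketch of such a program (libbpf conventions; the program name
 * is hypothetical), modelled on the in-tree selftests:
 *
 *	SEC("iter/tcp")
 *	int dump_tcp4(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "%pI4:%u uid %u\n",
 *			       &skc->skc_rcv_saddr, skc->skc_num,
 *			       ctx->uid);
 *		return 0;
 *	}
 *
 * Once loaded and attached, the resulting link can be pinned in bpffs
 * and read like an ordinary file.
 */
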
static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}

static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
				      unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_tcp_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;
	return 0;
}

static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
	return expected;
}

static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	return expected;
}

static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The st->bucket is done.  Directly advance to the next
	 * bucket instead of letting tcp_seek_last_pos() skip
	 * sockets one by one in the current bucket only to discover
	 * it has to advance to the next bucket.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > tcp_hashinfo.lhash2_mask) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}
again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}
	return sk;
}

static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
	if (*pos)
		return bpf_iter_tcp_batch(seq);

	return SEQ_START_TOKEN;
}

static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		/* Move st->offset to the next sk in the bucket such that
		 * the future start() will resume at st->offset in
		 * st->bucket.  See tcp_seek_last_pos().
		 */
		st->offset++;
		sock_put(iter->batch[iter->cur_sk++]);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}

static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	bool slow;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		unlock_sock_fast(sk, slow);
	return ret;
}

static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk) {
		bpf_iter_tcp_put_batch(iter);
		iter->st_bucket_done = false;
	}
}

static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
#endif

static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter.  Let the bpf prog do the filtering instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}

static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

/* @wake is one when sk_stream_write_space() calls us.
 * This sends EPOLLOUT only once notsent_bytes drops below half the limit.
 * This mimics the strategy used in sock_def_write_space().
 */
bool tcp_stream_memory_free(const struct sock *sk, int wake)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
			    READ_ONCE(tp->snd_nxt);

	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
}
EXPORT_SYMBOL(tcp_stream_memory_free);

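/* Worked example (numbers invented): with tcp_notsent_lowat at 131072,
 * the plain check (wake == 0) passes while notsent_bytes < 131072, but
 * the wakeup path (wake == 1) evaluates (notsent_bytes << 1) < 131072,
 * i.e. notsent_bytes < 65536, so EPOLLOUT fires only once the unsent
 * backlog has drained below half the limit.
 */
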
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

static void __net_exit tcp_sk_exit(struct net *net)
{
	struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);
	if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
		kfree(tcp_death_row);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int cnt;

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
	if (!net->ipv4.tcp_death_row)
		return -ENOMEM;
	refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
}

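/* The per-netns default selected above can be overridden per socket
 * with the standard TCP_CONGESTION socket option.  Illustrative
 * userspace sketch (Reno is always available, per the comment above):
 *
 *	char cc[] = "reno";
 *
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cc, strlen(cc)) < 0)
 *		perror("TCP_CONGESTION");
 */
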
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

#define INIT_BATCH_SZ 16

static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}
	return 0;
}

static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}

static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};

static const struct bpf_func_proto *
bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
			    const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}

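/* Exposing bpf_sk_setsockopt()/bpf_sk_getsockopt() means an iterator
 * program can rewrite options on every socket it visits.  A hedged
 * sketch, loosely modelled on the in-tree selftests (the program name
 * is hypothetical):
 *
 *	SEC("iter/tcp")
 *	int set_cc(struct bpf_iter__tcp *ctx)
 *	{
 *		char cc[] = "cubic";
 *		struct tcp_sock *tp;
 *
 *		tp = bpf_skc_to_tcp_sock(ctx->sk_common);
 *		if (!tp)
 *			return 0;
 *		bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
 *		return 0;
 *	}
 */
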
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};

static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}
#endif

void __init tcp_v4_init(void)
{
	int cpu, res;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, &init_net);
		if (res)
			panic("Failed to create the TCP control socket.\n");
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		per_cpu(ipv4_tcp_sk, cpu) = sk;
	}
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}