net/ipv4/tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
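/* Derive the initial sequence number for this connection from the
 * addresses and ports found in the incoming skb's IP and TCP headers.
 */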
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98         return secure_tcp_seq(ip_hdr(skb)->daddr,
99                               ip_hdr(skb)->saddr,
100                               tcp_hdr(skb)->dest,
101                               tcp_hdr(skb)->source);
102 }
103
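/* Derive the randomized timestamp offset for this connection from the
 * addresses in the incoming skb's IP header.
 */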
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108
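/* Decide whether a TIME-WAIT socket occupying the wanted 4-tuple may be
 * reused for a new outgoing connection.  Returns 1 (after taking a
 * reference on the timewait socket) if reuse is allowed, 0 otherwise.
 */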
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         const struct inet_timewait_sock *tw = inet_twsk(sktw);
112         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
113         struct tcp_sock *tp = tcp_sk(sk);
114         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
115
116         if (reuse == 2) {
117                 /* Still does not detect *everything* that goes through
118                  * lo, since we require a loopback src or dst address
119                  * or direct binding to the 'lo' interface.
120                  */
121                 bool loopback = false;
122                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123                         loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125                 if (tw->tw_family == AF_INET6) {
126                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130                                 loopback = true;
131                 } else
132 #endif
133                 {
134                         if (ipv4_is_loopback(tw->tw_daddr) ||
135                             ipv4_is_loopback(tw->tw_rcv_saddr))
136                                 loopback = true;
137                 }
138                 if (!loopback)
139                         reuse = 0;
140         }
141
142         /* With PAWS, it is safe from the viewpoint
143            of data integrity. Even without PAWS it is safe provided sequence
144            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
145
146            Actually, the idea is close to VJ's: only the timestamp cache is
147            held not per host but per port pair, and the TW bucket is used as
148            the state holder.
149
150            If the TW bucket has already been destroyed, we fall back to VJ's
151            scheme and use the initial timestamp retrieved from the peer table.
152          */
153         if (tcptw->tw_ts_recent_stamp &&
154             (!twp || (reuse && time_after32(ktime_get_seconds(),
155                                             tcptw->tw_ts_recent_stamp)))) {
156                 /* In case of repair and re-using TIME-WAIT sockets we still
157                  * want to be sure that it is safe as above but honor the
158                  * sequence numbers and time stamps set as part of the repair
159                  * process.
160                  *
161                  * Without this check re-using a TIME-WAIT socket with TCP
162                  * repair would accumulate a -1 on the repair assigned
163                  * sequence number. The first time it is reused the sequence
164                  * is -1, the second time -2, etc. This fixes that issue
165                  * without appearing to create any others.
166                  */
167                 if (likely(!tp->repair)) {
168                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170                         if (!seq)
171                                 seq = 1;
172                         WRITE_ONCE(tp->write_seq, seq);
173                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
174                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175                 }
176                 sock_hold(sktw);
177                 return 1;
178         }
179
180         return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185                               int addr_len)
186 {
187         /* This check is replicated from tcp_v4_connect() and intended to
188          * prevent the BPF program called below from accessing bytes that
189          * are outside of the bounds specified by the user in addr_len.
190          */
191         if (addr_len < sizeof(struct sockaddr_in))
192                 return -EINVAL;
193
194         sock_owned_by_me(sk);
195
196         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203         struct inet_sock *inet = inet_sk(sk);
204         struct tcp_sock *tp = tcp_sk(sk);
205         __be16 orig_sport, orig_dport;
206         __be32 daddr, nexthop;
207         struct flowi4 *fl4;
208         struct rtable *rt;
209         int err;
210         struct ip_options_rcu *inet_opt;
211         struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
212
213         if (addr_len < sizeof(struct sockaddr_in))
214                 return -EINVAL;
215
216         if (usin->sin_family != AF_INET)
217                 return -EAFNOSUPPORT;
218
219         nexthop = daddr = usin->sin_addr.s_addr;
220         inet_opt = rcu_dereference_protected(inet->inet_opt,
221                                              lockdep_sock_is_held(sk));
222         if (inet_opt && inet_opt->opt.srr) {
223                 if (!daddr)
224                         return -EINVAL;
225                 nexthop = inet_opt->opt.faddr;
226         }
227
228         orig_sport = inet->inet_sport;
229         orig_dport = usin->sin_port;
230         fl4 = &inet->cork.fl.u.ip4;
231         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
233                               orig_dport, sk);
234         if (IS_ERR(rt)) {
235                 err = PTR_ERR(rt);
236                 if (err == -ENETUNREACH)
237                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
238                 return err;
239         }
240
241         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
242                 ip_rt_put(rt);
243                 return -ENETUNREACH;
244         }
245
246         if (!inet_opt || !inet_opt->opt.srr)
247                 daddr = fl4->daddr;
248
249         if (!inet->inet_saddr)
250                 inet->inet_saddr = fl4->saddr;
251         sk_rcv_saddr_set(sk, inet->inet_saddr);
252
253         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
254                 /* Reset inherited state */
255                 tp->rx_opt.ts_recent       = 0;
256                 tp->rx_opt.ts_recent_stamp = 0;
257                 if (likely(!tp->repair))
258                         WRITE_ONCE(tp->write_seq, 0);
259         }
260
261         inet->inet_dport = usin->sin_port;
262         sk_daddr_set(sk, daddr);
263
264         inet_csk(sk)->icsk_ext_hdr_len = 0;
265         if (inet_opt)
266                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267
268         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269
270         /* Socket identity is still unknown (sport may be zero).
271          * However, we set the state to SYN-SENT and, without releasing the
272          * socket lock, select a source port, enter ourselves into the hash
273          * tables and complete initialization afterwards.
274          */
275         tcp_set_state(sk, TCP_SYN_SENT);
276         err = inet_hash_connect(tcp_death_row, sk);
277         if (err)
278                 goto failure;
279
280         sk_set_txhash(sk);
281
282         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
283                                inet->inet_sport, inet->inet_dport, sk);
284         if (IS_ERR(rt)) {
285                 err = PTR_ERR(rt);
286                 rt = NULL;
287                 goto failure;
288         }
289         /* OK, now commit destination to socket.  */
290         sk->sk_gso_type = SKB_GSO_TCPV4;
291         sk_setup_caps(sk, &rt->dst);
292         rt = NULL;
293
294         if (likely(!tp->repair)) {
295                 if (!tp->write_seq)
296                         WRITE_ONCE(tp->write_seq,
297                                    secure_tcp_seq(inet->inet_saddr,
298                                                   inet->inet_daddr,
299                                                   inet->inet_sport,
300                                                   usin->sin_port));
301                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
302                                                  inet->inet_saddr,
303                                                  inet->inet_daddr);
304         }
305
306         inet->inet_id = prandom_u32();
307
308         if (tcp_fastopen_defer_connect(sk, &err))
309                 return err;
310         if (err)
311                 goto failure;
312
313         err = tcp_connect(sk);
314
315         if (err)
316                 goto failure;
317
318         return 0;
319
320 failure:
321         /*
322          * This unhashes the socket and releases the local port,
323          * if necessary.
324          */
325         tcp_set_state(sk, TCP_CLOSE);
326         ip_rt_put(rt);
327         sk->sk_route_caps = 0;
328         inet->inet_dport = 0;
329         return err;
330 }
331 EXPORT_SYMBOL(tcp_v4_connect);
332
333 /*
334  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
335  * It can be called through tcp_release_cb() if the socket was owned by the
336  * user at the time tcp_v4_err() was called to handle the ICMP message.
337  */
338 void tcp_v4_mtu_reduced(struct sock *sk)
339 {
340         struct inet_sock *inet = inet_sk(sk);
341         struct dst_entry *dst;
342         u32 mtu;
343
344         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
345                 return;
346         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
347         dst = inet_csk_update_pmtu(sk, mtu);
348         if (!dst)
349                 return;
350
351         /* Something is about to go wrong... Remember the soft error
352          * in case this connection is not able to recover.
353          */
354         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
355                 sk->sk_err_soft = EMSGSIZE;
356
357         mtu = dst_mtu(dst);
358
359         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
360             ip_sk_accept_pmtu(sk) &&
361             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
362                 tcp_sync_mss(sk, mtu);
363
364                 /* Resend the TCP packet because it's
365                  * clear that the old packet has been
366                  * dropped. This is the new "fast" path mtu
367                  * discovery.
368                  */
369                 tcp_simple_retransmit(sk);
370         } /* else let the usual retransmit timer handle it */
371 }
372 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
373
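/* Propagate an ICMP redirect to the socket's cached route, if that route
 * is still valid.
 */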
374 static void do_redirect(struct sk_buff *skb, struct sock *sk)
375 {
376         struct dst_entry *dst = __sk_dst_check(sk, 0);
377
378         if (dst)
379                 dst->ops->redirect(dst, sk, skb);
380 }
381
382
383 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
384 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385 {
386         struct request_sock *req = inet_reqsk(sk);
387         struct net *net = sock_net(sk);
388
389         /* ICMPs are not backlogged, hence we cannot get
390          * an established socket here.
391          */
392         if (seq != tcp_rsk(req)->snt_isn) {
393                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
394         } else if (abort) {
395                 /*
396                  * Still in SYN_RECV, just remove it silently.
397                  * There is no good way to pass the error to the newly
398                  * created socket, and POSIX does not want network
399                  * errors returned from accept().
400                  */
401                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
402                 tcp_listendrop(req->rsk_listener);
403         }
404         reqsk_put(req);
405 }
406 EXPORT_SYMBOL(tcp_req_err);
407
408 /* TCP-LD (RFC 6069) logic */
409 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
410 {
411         struct inet_connection_sock *icsk = inet_csk(sk);
412         struct tcp_sock *tp = tcp_sk(sk);
413         struct sk_buff *skb;
414         s32 remaining;
415         u32 delta_us;
416
417         if (sock_owned_by_user(sk))
418                 return;
419
420         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
421             !icsk->icsk_backoff)
422                 return;
423
424         skb = tcp_rtx_queue_head(sk);
425         if (WARN_ON_ONCE(!skb))
426                 return;
427
428         icsk->icsk_backoff--;
429         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
430         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
431
432         tcp_mstamp_refresh(tp);
433         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
434         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
435
436         if (remaining > 0) {
437                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438                                           remaining, TCP_RTO_MAX);
439         } else {
440                 /* RTO revert clocked out retransmission.
441                  * Will retransmit now.
442                  */
443                 tcp_retransmit_timer(sk);
444         }
445 }
446 EXPORT_SYMBOL(tcp_ld_RTO_revert);
447
448 /*
449  * This routine is called by the ICMP module when it gets some
450  * sort of error condition.  If err < 0 then the socket should
451  * be closed and the error returned to the user.  If err > 0
452  * it's just the icmp type << 8 | icmp code.  After adjustment, the
453  * header points to the first 8 bytes of the tcp header.  We need
454  * to find the appropriate port.
455  *
456  * The locking strategy used here is very "optimistic". When
457  * someone else accesses the socket the ICMP is just dropped
458  * and for some paths there is no check at all.
459  * A more general error queue to queue errors for later handling
460  * is probably better.
461  *
462  */
463
464 int tcp_v4_err(struct sk_buff *skb, u32 info)
465 {
466         const struct iphdr *iph = (const struct iphdr *)skb->data;
467         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468         struct tcp_sock *tp;
469         struct inet_sock *inet;
470         const int type = icmp_hdr(skb)->type;
471         const int code = icmp_hdr(skb)->code;
472         struct sock *sk;
473         struct request_sock *fastopen;
474         u32 seq, snd_una;
475         int err;
476         struct net *net = dev_net(skb->dev);
477
478         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
479                                        th->dest, iph->saddr, ntohs(th->source),
480                                        inet_iif(skb), 0);
481         if (!sk) {
482                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
483                 return -ENOENT;
484         }
485         if (sk->sk_state == TCP_TIME_WAIT) {
486                 inet_twsk_put(inet_twsk(sk));
487                 return 0;
488         }
489         seq = ntohl(th->seq);
490         if (sk->sk_state == TCP_NEW_SYN_RECV) {
491                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
492                                      type == ICMP_TIME_EXCEEDED ||
493                                      (type == ICMP_DEST_UNREACH &&
494                                       (code == ICMP_NET_UNREACH ||
495                                        code == ICMP_HOST_UNREACH)));
496                 return 0;
497         }
498
499         bh_lock_sock(sk);
500         /* If too many ICMPs get dropped on busy
501          * servers this needs to be solved differently.
502          * We do take care of the PMTU discovery (RFC1191) special case:
503          * we can receive locally generated ICMP messages while the socket is held.
504          */
505         if (sock_owned_by_user(sk)) {
506                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
507                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508         }
509         if (sk->sk_state == TCP_CLOSE)
510                 goto out;
511
512         if (static_branch_unlikely(&ip4_min_ttl)) {
513                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
514                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
515                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
516                         goto out;
517                 }
518         }
519
520         tp = tcp_sk(sk);
521         /* XXX (TFO) - tp->snd_una should be the ISN (tcp_create_openreq_child()) */
522         fastopen = rcu_dereference(tp->fastopen_rsk);
523         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524         if (sk->sk_state != TCP_LISTEN &&
525             !between(seq, snd_una, tp->snd_nxt)) {
526                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
527                 goto out;
528         }
529
530         switch (type) {
531         case ICMP_REDIRECT:
532                 if (!sock_owned_by_user(sk))
533                         do_redirect(skb, sk);
534                 goto out;
535         case ICMP_SOURCE_QUENCH:
536                 /* Just silently ignore these. */
537                 goto out;
538         case ICMP_PARAMETERPROB:
539                 err = EPROTO;
540                 break;
541         case ICMP_DEST_UNREACH:
542                 if (code > NR_ICMP_UNREACH)
543                         goto out;
544
545                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
546                         /* We are not interested in TCP_LISTEN and open_requests
547                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
548                          * they should go through unfragmented).
549                          */
550                         if (sk->sk_state == TCP_LISTEN)
551                                 goto out;
552
553                         WRITE_ONCE(tp->mtu_info, info);
554                         if (!sock_owned_by_user(sk)) {
555                                 tcp_v4_mtu_reduced(sk);
556                         } else {
557                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
558                                         sock_hold(sk);
559                         }
560                         goto out;
561                 }
562
563                 err = icmp_err_convert[code].errno;
564                 /* Check if this ICMP message allows us to revert the backoff
565                  * (see RFC 6069)
566                  */
567                 if (!fastopen &&
568                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569                         tcp_ld_RTO_revert(sk, seq);
570                 break;
571         case ICMP_TIME_EXCEEDED:
572                 err = EHOSTUNREACH;
573                 break;
574         default:
575                 goto out;
576         }
577
578         switch (sk->sk_state) {
579         case TCP_SYN_SENT:
580         case TCP_SYN_RECV:
581                 /* Only in fast or simultaneous open. If a fast open socket is
582                  * already accepted it is treated as a connected one below.
583                  */
584                 if (fastopen && !fastopen->sk)
585                         break;
586
587                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588
589                 if (!sock_owned_by_user(sk)) {
590                         sk->sk_err = err;
591
592                         sk_error_report(sk);
593
594                         tcp_done(sk);
595                 } else {
596                         sk->sk_err_soft = err;
597                 }
598                 goto out;
599         }
600
601         /* If we've already connected we will keep trying
602          * until we time out, or the user gives up.
603          *
604          * rfc1122 4.2.3.9 allows us to treat as hard errors
605          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
606          * but it is obsoleted by pmtu discovery).
607          *
608          * Note that in the modern internet, where routing is unreliable
609          * and broken firewalls sit in every dark corner sending random
610          * errors ordered by their masters, even these two messages finally
611          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
612          *
613          * Now we are in compliance with RFCs.
614          *                                                      --ANK (980905)
615          */
616
617         inet = inet_sk(sk);
618         if (!sock_owned_by_user(sk) && inet->recverr) {
619                 sk->sk_err = err;
620                 sk_error_report(sk);
621         } else  { /* Only an error on timeout */
622                 sk->sk_err_soft = err;
623         }
624
625 out:
626         bh_unlock_sock(sk);
627         sock_put(sk);
628         return 0;
629 }
630
631 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632 {
633         struct tcphdr *th = tcp_hdr(skb);
634
635         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636         skb->csum_start = skb_transport_header(skb) - skb->head;
637         skb->csum_offset = offsetof(struct tcphdr, check);
638 }
639
640 /* This routine computes an IPv4 TCP checksum. */
641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642 {
643         const struct inet_sock *inet = inet_sk(sk);
644
645         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646 }
647 EXPORT_SYMBOL(tcp_v4_send_check);
648
649 /*
650  *      This routine will send an RST to the other tcp.
651  *
652  *      Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
653  *                    for the reset?
654  *      Answer: if a packet caused an RST, it is not for a socket
655  *              existing in our system; if it is matched to a socket,
656  *              it is just a duplicate segment or a bug in the other side's TCP.
657  *              So we build the reply based only on the parameters that
658  *              arrived with the segment.
659  *      Exception: precedence violation. We do not implement it in any case.
660  */
661
662 #ifdef CONFIG_TCP_MD5SIG
663 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
664 #else
665 #define OPTION_BYTES sizeof(__be32)
666 #endif
667
668 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669 {
670         const struct tcphdr *th = tcp_hdr(skb);
671         struct {
672                 struct tcphdr th;
673                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
674         } rep;
675         struct ip_reply_arg arg;
676 #ifdef CONFIG_TCP_MD5SIG
677         struct tcp_md5sig_key *key = NULL;
678         const __u8 *hash_location = NULL;
679         unsigned char newhash[16];
680         int genhash;
681         struct sock *sk1 = NULL;
682 #endif
683         u64 transmit_time = 0;
684         struct sock *ctl_sk;
685         struct net *net;
686
687         /* Never send a reset in response to a reset. */
688         if (th->rst)
689                 return;
690
691         /* If sk is not NULL, it means we did a successful lookup and the
692          * incoming route had to be correct. prequeue might have dropped our dst.
693          */
694         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
695                 return;
696
697         /* Swap the send and the receive. */
698         memset(&rep, 0, sizeof(rep));
699         rep.th.dest   = th->source;
700         rep.th.source = th->dest;
701         rep.th.doff   = sizeof(struct tcphdr) / 4;
702         rep.th.rst    = 1;
703
704         if (th->ack) {
705                 rep.th.seq = th->ack_seq;
706         } else {
707                 rep.th.ack = 1;
708                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709                                        skb->len - (th->doff << 2));
710         }
711
712         memset(&arg, 0, sizeof(arg));
713         arg.iov[0].iov_base = (unsigned char *)&rep;
714         arg.iov[0].iov_len  = sizeof(rep.th);
715
716         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717 #ifdef CONFIG_TCP_MD5SIG
718         rcu_read_lock();
719         hash_location = tcp_parse_md5sig_option(th);
720         if (sk && sk_fullsock(sk)) {
721                 const union tcp_md5_addr *addr;
722                 int l3index;
723
724                 /* sdif set, means packet ingressed via a device
725                  * in an L3 domain and inet_iif is set to it.
726                  */
727                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730         } else if (hash_location) {
731                 const union tcp_md5_addr *addr;
732                 int sdif = tcp_v4_sdif(skb);
733                 int dif = inet_iif(skb);
734                 int l3index;
735
736                 /*
737                  * The active side is lost. Try to find the listening socket through
738                  * the source port, and then find the md5 key through that socket.
739                  * We do not loosen security here:
740                  * the incoming packet is checked with the md5 hash of the found key,
741                  * and no RST is generated if the md5 hash doesn't match.
742                  */
743                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
744                                              ip_hdr(skb)->saddr,
745                                              th->source, ip_hdr(skb)->daddr,
746                                              ntohs(th->source), dif, sdif);
747                 /* don't send an RST if we can't find a key */
748                 if (!sk1)
749                         goto out;
750
751                 /* sdif set, means packet ingressed via a device
752                  * in an L3 domain and dif is set to it.
753                  */
754                 l3index = sdif ? dif : 0;
755                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
757                 if (!key)
758                         goto out;
759
760
761                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
763                         goto out;
764
765         }
766
767         if (key) {
768                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
769                                    (TCPOPT_NOP << 16) |
770                                    (TCPOPT_MD5SIG << 8) |
771                                    TCPOLEN_MD5SIG);
772                 /* Update length and the length the header thinks exists */
773                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774                 rep.th.doff = arg.iov[0].iov_len / 4;
775
776                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777                                      key, ip_hdr(skb)->saddr,
778                                      ip_hdr(skb)->daddr, &rep.th);
779         }
780 #endif
781         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
782         if (rep.opt[0] == 0) {
783                 __be32 mrst = mptcp_reset_option(skb);
784
785                 if (mrst) {
786                         rep.opt[0] = mrst;
787                         arg.iov[0].iov_len += sizeof(mrst);
788                         rep.th.doff = arg.iov[0].iov_len / 4;
789                 }
790         }
791
792         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793                                       ip_hdr(skb)->saddr, /* XXX */
794                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
795         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
797
798         /* When socket is gone, all binding information is lost.
799          * Routing might fail in this case. No choice here: if we force the
800          * input interface, we will misroute in the case of an asymmetric route.
801          */
802         if (sk) {
803                 arg.bound_dev_if = sk->sk_bound_dev_if;
804                 if (sk_fullsock(sk))
805                         trace_tcp_send_reset(sk, skb);
806         }
807
808         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
810
811         arg.tos = ip_hdr(skb)->tos;
812         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
813         local_bh_disable();
814         ctl_sk = this_cpu_read(ipv4_tcp_sk);
815         sock_net_set(ctl_sk, net);
816         if (sk) {
817                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
819                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
821                 transmit_time = tcp_transmit_time(sk);
822         }
823         ip_send_unicast_reply(ctl_sk,
824                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
825                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
826                               &arg, arg.iov[0].iov_len,
827                               transmit_time);
828
829         ctl_sk->sk_mark = 0;
830         sock_net_set(ctl_sk, &init_net);
831         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
832         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
833         local_bh_enable();
834
835 #ifdef CONFIG_TCP_MD5SIG
836 out:
837         rcu_read_unlock();
838 #endif
839 }
840
841 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
842    outside of socket context, is certainly ugly. What can I do?
843  */
844
845 static void tcp_v4_send_ack(const struct sock *sk,
846                             struct sk_buff *skb, u32 seq, u32 ack,
847                             u32 win, u32 tsval, u32 tsecr, int oif,
848                             struct tcp_md5sig_key *key,
849                             int reply_flags, u8 tos)
850 {
851         const struct tcphdr *th = tcp_hdr(skb);
852         struct {
853                 struct tcphdr th;
854                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
855 #ifdef CONFIG_TCP_MD5SIG
856                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
857 #endif
858                         ];
859         } rep;
860         struct net *net = sock_net(sk);
861         struct ip_reply_arg arg;
862         struct sock *ctl_sk;
863         u64 transmit_time;
864
865         memset(&rep.th, 0, sizeof(struct tcphdr));
866         memset(&arg, 0, sizeof(arg));
867
868         arg.iov[0].iov_base = (unsigned char *)&rep;
869         arg.iov[0].iov_len  = sizeof(rep.th);
870         if (tsecr) {
871                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
872                                    (TCPOPT_TIMESTAMP << 8) |
873                                    TCPOLEN_TIMESTAMP);
874                 rep.opt[1] = htonl(tsval);
875                 rep.opt[2] = htonl(tsecr);
876                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
877         }
878
879         /* Swap the send and the receive. */
880         rep.th.dest    = th->source;
881         rep.th.source  = th->dest;
882         rep.th.doff    = arg.iov[0].iov_len / 4;
883         rep.th.seq     = htonl(seq);
884         rep.th.ack_seq = htonl(ack);
885         rep.th.ack     = 1;
886         rep.th.window  = htons(win);
887
888 #ifdef CONFIG_TCP_MD5SIG
889         if (key) {
890                 int offset = (tsecr) ? 3 : 0;
891
892                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
893                                           (TCPOPT_NOP << 16) |
894                                           (TCPOPT_MD5SIG << 8) |
895                                           TCPOLEN_MD5SIG);
896                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
897                 rep.th.doff = arg.iov[0].iov_len/4;
898
899                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
900                                     key, ip_hdr(skb)->saddr,
901                                     ip_hdr(skb)->daddr, &rep.th);
902         }
903 #endif
904         arg.flags = reply_flags;
905         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
906                                       ip_hdr(skb)->saddr, /* XXX */
907                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
908         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
909         if (oif)
910                 arg.bound_dev_if = oif;
911         arg.tos = tos;
912         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
913         local_bh_disable();
914         ctl_sk = this_cpu_read(ipv4_tcp_sk);
915         sock_net_set(ctl_sk, net);
916         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
917                            inet_twsk(sk)->tw_mark : sk->sk_mark;
918         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
919                            inet_twsk(sk)->tw_priority : sk->sk_priority;
920         transmit_time = tcp_transmit_time(sk);
921         ip_send_unicast_reply(ctl_sk,
922                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
923                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
924                               &arg, arg.iov[0].iov_len,
925                               transmit_time);
926
927         ctl_sk->sk_mark = 0;
928         sock_net_set(ctl_sk, &init_net);
929         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
930         local_bh_enable();
931 }
932
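/* Send an ACK on behalf of a TIME-WAIT socket, using the sequence and
 * timestamp state preserved in the timewait block, then drop our
 * reference on it.
 */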
933 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
934 {
935         struct inet_timewait_sock *tw = inet_twsk(sk);
936         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
937
938         tcp_v4_send_ack(sk, skb,
939                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
940                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
941                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
942                         tcptw->tw_ts_recent,
943                         tw->tw_bound_dev_if,
944                         tcp_twsk_md5_key(tcptw),
945                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
946                         tw->tw_tos
947                         );
948
949         inet_twsk_put(tw);
950 }
951
952 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
953                                   struct request_sock *req)
954 {
955         const union tcp_md5_addr *addr;
956         int l3index;
957
958         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
959          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
960          */
961         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
962                                              tcp_sk(sk)->snd_nxt;
963
964         /* RFC 7323 2.3
965          * The window field (SEG.WND) of every outgoing segment, with the
966          * exception of <SYN> segments, MUST be right-shifted by
967          * Rcv.Wind.Shift bits:
968          */
969         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
970         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
971         tcp_v4_send_ack(sk, skb, seq,
972                         tcp_rsk(req)->rcv_nxt,
973                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
974                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
975                         req->ts_recent,
976                         0,
977                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
978                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
979                         ip_hdr(skb)->tos);
980 }
981
982 /*
983  *      Send a SYN-ACK after having received a SYN.
984  *      This still operates on a request_sock only, not on a big
985  *      socket.
986  */
987 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
988                               struct flowi *fl,
989                               struct request_sock *req,
990                               struct tcp_fastopen_cookie *foc,
991                               enum tcp_synack_type synack_type,
992                               struct sk_buff *syn_skb)
993 {
994         const struct inet_request_sock *ireq = inet_rsk(req);
995         struct flowi4 fl4;
996         int err = -1;
997         struct sk_buff *skb;
998         u8 tos;
999
1000         /* First, grab a route. */
1001         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1002                 return -1;
1003
1004         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1005
1006         if (skb) {
1007                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1008
1009                 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1010                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1011                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1012                                 inet_sk(sk)->tos;
1013
1014                 if (!INET_ECN_is_capable(tos) &&
1015                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1016                         tos |= INET_ECN_ECT_0;
1017
1018                 rcu_read_lock();
1019                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1020                                             ireq->ir_rmt_addr,
1021                                             rcu_dereference(ireq->ireq_opt),
1022                                             tos);
1023                 rcu_read_unlock();
1024                 err = net_xmit_eval(err);
1025         }
1026
1027         return err;
1028 }
1029
1030 /*
1031  *      IPv4 request_sock destructor.
1032  */
1033 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1034 {
1035         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1036 }
1037
1038 #ifdef CONFIG_TCP_MD5SIG
1039 /*
1040  * RFC2385 MD5 checksumming requires a mapping of
1041  * IP address->MD5 Key.
1042  * We need to maintain these in the sk structure.
1043  */
1044
1045 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1046 EXPORT_SYMBOL(tcp_md5_needed);
1047
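/* Prefer keys bound to an L3 domain over unbound ones; among keys of the
 * same kind, prefer the one with the longer prefix.
 */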
1048 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1049 {
1050         if (!old)
1051                 return true;
1052
1053         /* l3index always overrides non-l3index */
1054         if (old->l3index && new->l3index == 0)
1055                 return false;
1056         if (old->l3index == 0 && new->l3index)
1057                 return true;
1058
1059         return old->prefixlen < new->prefixlen;
1060 }
1061
1062 /* Find the Key structure for an address.  */
1063 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1064                                            const union tcp_md5_addr *addr,
1065                                            int family)
1066 {
1067         const struct tcp_sock *tp = tcp_sk(sk);
1068         struct tcp_md5sig_key *key;
1069         const struct tcp_md5sig_info *md5sig;
1070         __be32 mask;
1071         struct tcp_md5sig_key *best_match = NULL;
1072         bool match;
1073
1074         /* caller either holds rcu_read_lock() or socket lock */
1075         md5sig = rcu_dereference_check(tp->md5sig_info,
1076                                        lockdep_sock_is_held(sk));
1077         if (!md5sig)
1078                 return NULL;
1079
1080         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1081                                  lockdep_sock_is_held(sk)) {
1082                 if (key->family != family)
1083                         continue;
1084                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1085                         continue;
1086                 if (family == AF_INET) {
1087                         mask = inet_make_mask(key->prefixlen);
1088                         match = (key->addr.a4.s_addr & mask) ==
1089                                 (addr->a4.s_addr & mask);
1090 #if IS_ENABLED(CONFIG_IPV6)
1091                 } else if (family == AF_INET6) {
1092                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1093                                                   key->prefixlen);
1094 #endif
1095                 } else {
1096                         match = false;
1097                 }
1098
1099                 if (match && better_md5_match(best_match, key))
1100                         best_match = key;
1101         }
1102         return best_match;
1103 }
1104 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1105
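/* Find a key whose address, prefix length, L3 index and ifindex flag all
 * match exactly; used when adding or deleting a specific key.
 */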
1106 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1107                                                       const union tcp_md5_addr *addr,
1108                                                       int family, u8 prefixlen,
1109                                                       int l3index, u8 flags)
1110 {
1111         const struct tcp_sock *tp = tcp_sk(sk);
1112         struct tcp_md5sig_key *key;
1113         unsigned int size = sizeof(struct in_addr);
1114         const struct tcp_md5sig_info *md5sig;
1115
1116         /* caller either holds rcu_read_lock() or socket lock */
1117         md5sig = rcu_dereference_check(tp->md5sig_info,
1118                                        lockdep_sock_is_held(sk));
1119         if (!md5sig)
1120                 return NULL;
1121 #if IS_ENABLED(CONFIG_IPV6)
1122         if (family == AF_INET6)
1123                 size = sizeof(struct in6_addr);
1124 #endif
1125         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1126                                  lockdep_sock_is_held(sk)) {
1127                 if (key->family != family)
1128                         continue;
1129                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1130                         continue;
1131                 if (key->l3index != l3index)
1132                         continue;
1133                 if (!memcmp(&key->addr, addr, size) &&
1134                     key->prefixlen == prefixlen)
1135                         return key;
1136         }
1137         return NULL;
1138 }
1139
1140 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1141                                          const struct sock *addr_sk)
1142 {
1143         const union tcp_md5_addr *addr;
1144         int l3index;
1145
1146         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1147                                                  addr_sk->sk_bound_dev_if);
1148         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1149         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1150 }
1151 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1152
1153 /* This can be called on a newly created socket, from other files */
1154 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1155                    int family, u8 prefixlen, int l3index, u8 flags,
1156                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1157 {
1158         /* Add Key to the list */
1159         struct tcp_md5sig_key *key;
1160         struct tcp_sock *tp = tcp_sk(sk);
1161         struct tcp_md5sig_info *md5sig;
1162
1163         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1164         if (key) {
1165                 /* Pre-existing entry - just update that one.
1166                  * Note that the key might be used concurrently.
1167                  * data_race() is telling kcsan that we do not care about
1168                  * key mismatches, since changing the MD5 key on live flows
1169                  * can lead to packet drops.
1170                  */
1171                 data_race(memcpy(key->key, newkey, newkeylen));
1172
1173                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1174                  * Also note that a reader could catch the new key->keylen value
1175                  * but the old key->key[]; this is the reason we use __GFP_ZERO
1176                  * at sock_kmalloc() time below these lines.
1177                  */
1178                 WRITE_ONCE(key->keylen, newkeylen);
1179
1180                 return 0;
1181         }
1182
1183         md5sig = rcu_dereference_protected(tp->md5sig_info,
1184                                            lockdep_sock_is_held(sk));
1185         if (!md5sig) {
1186                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1187                 if (!md5sig)
1188                         return -ENOMEM;
1189
1190                 sk_gso_disable(sk);
1191                 INIT_HLIST_HEAD(&md5sig->head);
1192                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1193         }
1194
1195         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1196         if (!key)
1197                 return -ENOMEM;
1198         if (!tcp_alloc_md5sig_pool()) {
1199                 sock_kfree_s(sk, key, sizeof(*key));
1200                 return -ENOMEM;
1201         }
1202
1203         memcpy(key->key, newkey, newkeylen);
1204         key->keylen = newkeylen;
1205         key->family = family;
1206         key->prefixlen = prefixlen;
1207         key->l3index = l3index;
1208         key->flags = flags;
1209         memcpy(&key->addr, addr,
1210                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1211                                                                  sizeof(struct in_addr));
1212         hlist_add_head_rcu(&key->node, &md5sig->head);
1213         return 0;
1214 }
1215 EXPORT_SYMBOL(tcp_md5_do_add);
1216
1217 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1218                    u8 prefixlen, int l3index, u8 flags)
1219 {
1220         struct tcp_md5sig_key *key;
1221
1222         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1223         if (!key)
1224                 return -ENOENT;
1225         hlist_del_rcu(&key->node);
1226         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1227         kfree_rcu(key, rcu);
1228         return 0;
1229 }
1230 EXPORT_SYMBOL(tcp_md5_do_del);
1231
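/* Release every MD5 key attached to the socket. */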
1232 static void tcp_clear_md5_list(struct sock *sk)
1233 {
1234         struct tcp_sock *tp = tcp_sk(sk);
1235         struct tcp_md5sig_key *key;
1236         struct hlist_node *n;
1237         struct tcp_md5sig_info *md5sig;
1238
1239         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1240
1241         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1242                 hlist_del_rcu(&key->node);
1243                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1244                 kfree_rcu(key, rcu);
1245         }
1246 }
1247
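/* TCP_MD5SIG / TCP_MD5SIG_EXT setsockopt handler: validate the request
 * copied from userspace and add or delete the matching key.
 */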
1248 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1249                                  sockptr_t optval, int optlen)
1250 {
1251         struct tcp_md5sig cmd;
1252         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1253         const union tcp_md5_addr *addr;
1254         u8 prefixlen = 32;
1255         int l3index = 0;
1256         u8 flags;
1257
1258         if (optlen < sizeof(cmd))
1259                 return -EINVAL;
1260
1261         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1262                 return -EFAULT;
1263
1264         if (sin->sin_family != AF_INET)
1265                 return -EINVAL;
1266
1267         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1268
1269         if (optname == TCP_MD5SIG_EXT &&
1270             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1271                 prefixlen = cmd.tcpm_prefixlen;
1272                 if (prefixlen > 32)
1273                         return -EINVAL;
1274         }
1275
1276         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1277             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1278                 struct net_device *dev;
1279
1280                 rcu_read_lock();
1281                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1282                 if (dev && netif_is_l3_master(dev))
1283                         l3index = dev->ifindex;
1284
1285                 rcu_read_unlock();
1286
1287                 /* ok to reference set/not set outside of rcu;
1288                  * right now device MUST be an L3 master
1289                  */
1290                 if (!dev || !l3index)
1291                         return -EINVAL;
1292         }
1293
1294         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1295
1296         if (!cmd.tcpm_keylen)
1297                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1298
1299         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1300                 return -EINVAL;
1301
1302         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1303                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1304 }
1305
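/* Feed the IPv4 pseudo-header and a copy of the TCP header (with its
 * checksum field zeroed) into the pending MD5 hash request.
 */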
1306 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1307                                    __be32 daddr, __be32 saddr,
1308                                    const struct tcphdr *th, int nbytes)
1309 {
1310         struct tcp4_pseudohdr *bp;
1311         struct scatterlist sg;
1312         struct tcphdr *_th;
1313
1314         bp = hp->scratch;
1315         bp->saddr = saddr;
1316         bp->daddr = daddr;
1317         bp->pad = 0;
1318         bp->protocol = IPPROTO_TCP;
1319         bp->len = cpu_to_be16(nbytes);
1320
1321         _th = (struct tcphdr *)(bp + 1);
1322         memcpy(_th, th, sizeof(*th));
1323         _th->check = 0;
1324
1325         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1326         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1327                                 sizeof(*bp) + sizeof(*th));
1328         return crypto_ahash_update(hp->md5_req);
1329 }
1330
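/* Compute the MD5 signature over the pseudo-header, the TCP header and
 * the key only (no payload); on failure the hash is zeroed and 1 is
 * returned.
 */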
1331 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1332                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1333 {
1334         struct tcp_md5sig_pool *hp;
1335         struct ahash_request *req;
1336
1337         hp = tcp_get_md5sig_pool();
1338         if (!hp)
1339                 goto clear_hash_noput;
1340         req = hp->md5_req;
1341
1342         if (crypto_ahash_init(req))
1343                 goto clear_hash;
1344         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1345                 goto clear_hash;
1346         if (tcp_md5_hash_key(hp, key))
1347                 goto clear_hash;
1348         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1349         if (crypto_ahash_final(req))
1350                 goto clear_hash;
1351
1352         tcp_put_md5sig_pool();
1353         return 0;
1354
1355 clear_hash:
1356         tcp_put_md5sig_pool();
1357 clear_hash_noput:
1358         memset(md5_hash, 0, 16);
1359         return 1;
1360 }
1361
1362 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1363                         const struct sock *sk,
1364                         const struct sk_buff *skb)
1365 {
1366         struct tcp_md5sig_pool *hp;
1367         struct ahash_request *req;
1368         const struct tcphdr *th = tcp_hdr(skb);
1369         __be32 saddr, daddr;
1370
1371         if (sk) { /* valid for establish/request sockets */
1372                 saddr = sk->sk_rcv_saddr;
1373                 daddr = sk->sk_daddr;
1374         } else {
1375                 const struct iphdr *iph = ip_hdr(skb);
1376                 saddr = iph->saddr;
1377                 daddr = iph->daddr;
1378         }
1379
1380         hp = tcp_get_md5sig_pool();
1381         if (!hp)
1382                 goto clear_hash_noput;
1383         req = hp->md5_req;
1384
1385         if (crypto_ahash_init(req))
1386                 goto clear_hash;
1387
1388         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1389                 goto clear_hash;
1390         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1391                 goto clear_hash;
1392         if (tcp_md5_hash_key(hp, key))
1393                 goto clear_hash;
1394         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1395         if (crypto_ahash_final(req))
1396                 goto clear_hash;
1397
1398         tcp_put_md5sig_pool();
1399         return 0;
1400
1401 clear_hash:
1402         tcp_put_md5sig_pool();
1403 clear_hash_noput:
1404         memset(md5_hash, 0, 16);
1405         return 1;
1406 }
1407 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
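
/*
 * As a reading aid (following RFC 2385), the digest produced by the helpers
 * above covers, in this order:
 *
 *	1. the IPv4 pseudo-header built in hp->scratch
 *	   (saddr, daddr, zero pad, IPPROTO_TCP, segment length),
 *	2. the TCP header with th->check forced to zero
 *	   (TCP options are not covered),
 *	3. the TCP payload, if any (tcp_md5_hash_skb_data()),
 *	4. the key itself (tcp_md5_hash_key()).
 */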
1408
1409 #endif
1410
1411 static void tcp_v4_init_req(struct request_sock *req,
1412                             const struct sock *sk_listener,
1413                             struct sk_buff *skb)
1414 {
1415         struct inet_request_sock *ireq = inet_rsk(req);
1416         struct net *net = sock_net(sk_listener);
1417
1418         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1419         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1420         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1421 }
1422
1423 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1424                                           struct sk_buff *skb,
1425                                           struct flowi *fl,
1426                                           struct request_sock *req)
1427 {
1428         tcp_v4_init_req(req, sk, skb);
1429
1430         if (security_inet_conn_request(sk, skb, req))
1431                 return NULL;
1432
1433         return inet_csk_route_req(sk, &fl->u.ip4, req);
1434 }
1435
1436 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1437         .family         =       PF_INET,
1438         .obj_size       =       sizeof(struct tcp_request_sock),
1439         .rtx_syn_ack    =       tcp_rtx_synack,
1440         .send_ack       =       tcp_v4_reqsk_send_ack,
1441         .destructor     =       tcp_v4_reqsk_destructor,
1442         .send_reset     =       tcp_v4_send_reset,
1443         .syn_ack_timeout =      tcp_syn_ack_timeout,
1444 };
1445
1446 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1447         .mss_clamp      =       TCP_MSS_DEFAULT,
1448 #ifdef CONFIG_TCP_MD5SIG
1449         .req_md5_lookup =       tcp_v4_md5_lookup,
1450         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1451 #endif
1452 #ifdef CONFIG_SYN_COOKIES
1453         .cookie_init_seq =      cookie_v4_init_sequence,
1454 #endif
1455         .route_req      =       tcp_v4_route_req,
1456         .init_seq       =       tcp_v4_init_seq,
1457         .init_ts_off    =       tcp_v4_init_ts_off,
1458         .send_synack    =       tcp_v4_send_synack,
1459 };
1460
1461 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1462 {
1463         /* Never answer to SYNs sent to broadcast or multicast */
1464         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1465                 goto drop;
1466
1467         return tcp_conn_request(&tcp_request_sock_ops,
1468                                 &tcp_request_sock_ipv4_ops, sk, skb);
1469
1470 drop:
1471         tcp_listendrop(sk);
1472         return 0;
1473 }
1474 EXPORT_SYMBOL(tcp_v4_conn_request);
1475
1476
1477 /*
1478  * The three-way handshake has completed - we received a valid ACK -
1479  * now create the new socket.
1480  */
1481 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1482                                   struct request_sock *req,
1483                                   struct dst_entry *dst,
1484                                   struct request_sock *req_unhash,
1485                                   bool *own_req)
1486 {
1487         struct inet_request_sock *ireq;
1488         bool found_dup_sk = false;
1489         struct inet_sock *newinet;
1490         struct tcp_sock *newtp;
1491         struct sock *newsk;
1492 #ifdef CONFIG_TCP_MD5SIG
1493         const union tcp_md5_addr *addr;
1494         struct tcp_md5sig_key *key;
1495         int l3index;
1496 #endif
1497         struct ip_options_rcu *inet_opt;
1498
1499         if (sk_acceptq_is_full(sk))
1500                 goto exit_overflow;
1501
1502         newsk = tcp_create_openreq_child(sk, req, skb);
1503         if (!newsk)
1504                 goto exit_nonewsk;
1505
1506         newsk->sk_gso_type = SKB_GSO_TCPV4;
1507         inet_sk_rx_dst_set(newsk, skb);
1508
1509         newtp                 = tcp_sk(newsk);
1510         newinet               = inet_sk(newsk);
1511         ireq                  = inet_rsk(req);
1512         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1513         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1514         newsk->sk_bound_dev_if = ireq->ir_iif;
1515         newinet->inet_saddr   = ireq->ir_loc_addr;
1516         inet_opt              = rcu_dereference(ireq->ireq_opt);
1517         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1518         newinet->mc_index     = inet_iif(skb);
1519         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1520         newinet->rcv_tos      = ip_hdr(skb)->tos;
1521         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1522         if (inet_opt)
1523                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1524         newinet->inet_id = prandom_u32();
1525
1526         /* Set ToS of the new socket based upon the value of incoming SYN.
1527          * ECT bits are set later in tcp_init_transfer().
1528          */
1529         if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1530                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1531
1532         if (!dst) {
1533                 dst = inet_csk_route_child_sock(sk, newsk, req);
1534                 if (!dst)
1535                         goto put_and_exit;
1536         } else {
1537                 /* syncookie case: see end of cookie_v4_check() */
1538         }
1539         sk_setup_caps(newsk, dst);
1540
1541         tcp_ca_openreq_child(newsk, dst);
1542
1543         tcp_sync_mss(newsk, dst_mtu(dst));
1544         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1545
1546         tcp_initialize_rcv_mss(newsk);
1547
1548 #ifdef CONFIG_TCP_MD5SIG
1549         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1550         /* Copy over the MD5 key from the original socket */
1551         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1552         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1553         if (key) {
1554                 /*
1555                  * We're using one, so create a matching key
1556                  * on the newsk structure. If we fail to get
1557                  * memory, then we end up not copying the key
1558                  * across. Shucks.
1559                  */
1560                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1561                                key->key, key->keylen, GFP_ATOMIC);
1562                 sk_gso_disable(newsk);
1563         }
1564 #endif
1565
1566         if (__inet_inherit_port(sk, newsk) < 0)
1567                 goto put_and_exit;
1568         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1569                                        &found_dup_sk);
1570         if (likely(*own_req)) {
1571                 tcp_move_syn(newtp, req);
1572                 ireq->ireq_opt = NULL;
1573         } else {
1574                 newinet->inet_opt = NULL;
1575
1576                 if (!req_unhash && found_dup_sk) {
1577                         /* This code path should only be executed in the
1578                          * syncookie case
1579                          */
1580                         bh_unlock_sock(newsk);
1581                         sock_put(newsk);
1582                         newsk = NULL;
1583                 }
1584         }
1585         return newsk;
1586
1587 exit_overflow:
1588         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1589 exit_nonewsk:
1590         dst_release(dst);
1591 exit:
1592         tcp_listendrop(sk);
1593         return NULL;
1594 put_and_exit:
1595         newinet->inet_opt = NULL;
1596         inet_csk_prepare_forced_close(newsk);
1597         tcp_done(newsk);
1598         goto exit;
1599 }
1600 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1601
1602 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1603 {
1604 #ifdef CONFIG_SYN_COOKIES
1605         const struct tcphdr *th = tcp_hdr(skb);
1606
1607         if (!th->syn)
1608                 sk = cookie_v4_check(sk, skb);
1609 #endif
1610         return sk;
1611 }
1612
1613 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1614                          struct tcphdr *th, u32 *cookie)
1615 {
1616         u16 mss = 0;
1617 #ifdef CONFIG_SYN_COOKIES
1618         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1619                                     &tcp_request_sock_ipv4_ops, sk, th);
1620         if (mss) {
1621                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1622                 tcp_synq_overflow(sk);
1623         }
1624 #endif
1625         return mss;
1626 }
1627
1628 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1629                                                            u32));
1630 /* The socket must have its spinlock held when we get
1631  * here, unless it is a TCP_LISTEN socket.
1632  *
1633  * We have a potential double-lock case here, so even when
1634  * doing backlog processing we use the BH locking scheme.
1635  * This is because we cannot sleep with the original spinlock
1636  * held.
1637  */
1638 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1639 {
1640         enum skb_drop_reason reason;
1641         struct sock *rsk;
1642
1643         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1644                 struct dst_entry *dst;
1645
1646                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1647                                                 lockdep_sock_is_held(sk));
1648
1649                 sock_rps_save_rxhash(sk, skb);
1650                 sk_mark_napi_id(sk, skb);
1651                 if (dst) {
1652                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1653                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1654                                              dst, 0)) {
1655                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1656                                 dst_release(dst);
1657                         }
1658                 }
1659                 tcp_rcv_established(sk, skb);
1660                 return 0;
1661         }
1662
1663         reason = SKB_DROP_REASON_NOT_SPECIFIED;
1664         if (tcp_checksum_complete(skb))
1665                 goto csum_err;
1666
1667         if (sk->sk_state == TCP_LISTEN) {
1668                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1669
1670                 if (!nsk)
1671                         goto discard;
1672                 if (nsk != sk) {
1673                         if (tcp_child_process(sk, nsk, skb)) {
1674                                 rsk = nsk;
1675                                 goto reset;
1676                         }
1677                         return 0;
1678                 }
1679         } else
1680                 sock_rps_save_rxhash(sk, skb);
1681
1682         if (tcp_rcv_state_process(sk, skb)) {
1683                 rsk = sk;
1684                 goto reset;
1685         }
1686         return 0;
1687
1688 reset:
1689         tcp_v4_send_reset(rsk, skb);
1690 discard:
1691         kfree_skb_reason(skb, reason);
1692         /* Be careful here. If this function gets more complicated and
1693          * gcc suffers from register pressure on the x86, sk (in %ebx)
1694          * might be destroyed here. This current version compiles correctly,
1695          * but you have been warned.
1696          */
1697         return 0;
1698
1699 csum_err:
1700         reason = SKB_DROP_REASON_TCP_CSUM;
1701         trace_tcp_bad_csum(skb);
1702         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1703         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1704         goto discard;
1705 }
1706 EXPORT_SYMBOL(tcp_v4_do_rcv);
1707
1708 int tcp_v4_early_demux(struct sk_buff *skb)
1709 {
1710         const struct iphdr *iph;
1711         const struct tcphdr *th;
1712         struct sock *sk;
1713
1714         if (skb->pkt_type != PACKET_HOST)
1715                 return 0;
1716
1717         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1718                 return 0;
1719
1720         iph = ip_hdr(skb);
1721         th = tcp_hdr(skb);
1722
1723         if (th->doff < sizeof(struct tcphdr) / 4)
1724                 return 0;
1725
1726         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1727                                        iph->saddr, th->source,
1728                                        iph->daddr, ntohs(th->dest),
1729                                        skb->skb_iif, inet_sdif(skb));
1730         if (sk) {
1731                 skb->sk = sk;
1732                 skb->destructor = sock_edemux;
1733                 if (sk_fullsock(sk)) {
1734                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1735
1736                         if (dst)
1737                                 dst = dst_check(dst, 0);
1738                         if (dst &&
1739                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1740                                 skb_dst_set_noref(skb, dst);
1741                 }
1742         }
1743         return 0;
1744 }
1745
1746 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1747                      enum skb_drop_reason *reason)
1748 {
1749         u32 limit, tail_gso_size, tail_gso_segs;
1750         struct skb_shared_info *shinfo;
1751         const struct tcphdr *th;
1752         struct tcphdr *thtail;
1753         struct sk_buff *tail;
1754         unsigned int hdrlen;
1755         bool fragstolen;
1756         u32 gso_segs;
1757         u32 gso_size;
1758         int delta;
1759
1760         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1761          * we can fix skb->truesize to its real value to avoid future drops.
1762          * This is valid because skb is not yet charged to the socket.
1763          * It has been noticed that pure SACK packets were sometimes dropped
1764          * (if cooked by drivers without the copybreak feature).
1765          */
1766         skb_condense(skb);
1767
1768         skb_dst_drop(skb);
1769
1770         if (unlikely(tcp_checksum_complete(skb))) {
1771                 bh_unlock_sock(sk);
1772                 trace_tcp_bad_csum(skb);
1773                 *reason = SKB_DROP_REASON_TCP_CSUM;
1774                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1775                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1776                 return true;
1777         }
1778
1779         /* Attempt coalescing to last skb in backlog, even if we are
1780          * above the limits.
1781          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1782          */
1783         th = (const struct tcphdr *)skb->data;
1784         hdrlen = th->doff * 4;
1785
1786         tail = sk->sk_backlog.tail;
1787         if (!tail)
1788                 goto no_coalesce;
1789         thtail = (struct tcphdr *)tail->data;
1790
1791         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1792             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1793             ((TCP_SKB_CB(tail)->tcp_flags |
1794               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1795             !((TCP_SKB_CB(tail)->tcp_flags &
1796               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1797             ((TCP_SKB_CB(tail)->tcp_flags ^
1798               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1799 #ifdef CONFIG_TLS_DEVICE
1800             tail->decrypted != skb->decrypted ||
1801 #endif
1802             thtail->doff != th->doff ||
1803             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1804                 goto no_coalesce;
1805
1806         __skb_pull(skb, hdrlen);
1807
1808         shinfo = skb_shinfo(skb);
1809         gso_size = shinfo->gso_size ?: skb->len;
1810         gso_segs = shinfo->gso_segs ?: 1;
1811
1812         shinfo = skb_shinfo(tail);
1813         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1814         tail_gso_segs = shinfo->gso_segs ?: 1;
1815
1816         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1817                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1818
1819                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1820                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1821                         thtail->window = th->window;
1822                 }
1823
1824                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1825                  * thtail->fin, so that the fast path in tcp_rcv_established()
1826                  * is not entered if we append a packet with a FIN.
1827                  * SYN, RST, URG are not present.
1828                  * ACK is set on both packets.
1829                  * PSH : we do not really care in TCP stack,
1830                  *       at least for 'GRO' packets.
1831                  */
1832                 thtail->fin |= th->fin;
1833                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1834
1835                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1836                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1837                         tail->tstamp = skb->tstamp;
1838                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1839                 }
1840
1841                 /* Not as strict as GRO. We only need to carry mss max value */
1842                 shinfo->gso_size = max(gso_size, tail_gso_size);
1843                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1844
1845                 sk->sk_backlog.len += delta;
1846                 __NET_INC_STATS(sock_net(sk),
1847                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1848                 kfree_skb_partial(skb, fragstolen);
1849                 return false;
1850         }
1851         __skb_push(skb, hdrlen);
1852
1853 no_coalesce:
1854         /* Only the socket owner can try to collapse/prune rx queues
1855          * to reduce memory overhead, so add a little headroom here.
1856          * Only a few socket backlogs are likely to be non-empty at the same time.
1857          */
1858         limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1859
1860         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1861                 bh_unlock_sock(sk);
1862                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1863                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1864                 return true;
1865         }
1866         return false;
1867 }
1868 EXPORT_SYMBOL(tcp_add_backlog);
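
/*
 * Rough worked example for the backlog limit above, assuming the common
 * defaults of tcp_rmem[1] = 131072 (sk_rcvbuf) and tcp_wmem[1] = 16384
 * (sk_sndbuf); actual values depend on sysctls and autotuning:
 *
 *	limit = 131072 + 16384 + 64 * 1024 = 212992 bytes
 *
 * i.e. roughly 200 KB of not-yet-processed segments may be queued on the
 * backlog of a default-sized socket before sk_add_backlog() refuses more.
 */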
1869
1870 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1871 {
1872         struct tcphdr *th = (struct tcphdr *)skb->data;
1873
1874         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1875 }
1876 EXPORT_SYMBOL(tcp_filter);
1877
1878 static void tcp_v4_restore_cb(struct sk_buff *skb)
1879 {
1880         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1881                 sizeof(struct inet_skb_parm));
1882 }
1883
1884 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1885                            const struct tcphdr *th)
1886 {
1887         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1888          * barrier() makes sure the compiler won't play fool^Waliasing games.
1889          */
1890         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1891                 sizeof(struct inet_skb_parm));
1892         barrier();
1893
1894         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1895         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1896                                     skb->len - th->doff * 4);
1897         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1898         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1899         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1900         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1901         TCP_SKB_CB(skb)->sacked  = 0;
1902         TCP_SKB_CB(skb)->has_rxtstamp =
1903                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1904 }
1905
1906 /*
1907  *      From tcp_input.c
1908  */
1909
1910 int tcp_v4_rcv(struct sk_buff *skb)
1911 {
1912         struct net *net = dev_net(skb->dev);
1913         enum skb_drop_reason drop_reason;
1914         int sdif = inet_sdif(skb);
1915         int dif = inet_iif(skb);
1916         const struct iphdr *iph;
1917         const struct tcphdr *th;
1918         bool refcounted;
1919         struct sock *sk;
1920         int ret;
1921
1922         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1923         if (skb->pkt_type != PACKET_HOST)
1924                 goto discard_it;
1925
1926         /* Count it even if it's bad */
1927         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1928
1929         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1930                 goto discard_it;
1931
1932         th = (const struct tcphdr *)skb->data;
1933
1934         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1935                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1936                 goto bad_packet;
1937         }
1938         if (!pskb_may_pull(skb, th->doff * 4))
1939                 goto discard_it;
1940
1941         /* An explanation is required here, I think.
1942          * Packet length and doff are validated by header prediction,
1943          * provided the case of th->doff==0 is eliminated.
1944          * So, we defer the checks. */
1945
1946         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1947                 goto csum_error;
1948
1949         th = (const struct tcphdr *)skb->data;
1950         iph = ip_hdr(skb);
1951 lookup:
1952         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1953                                th->dest, sdif, &refcounted);
1954         if (!sk)
1955                 goto no_tcp_socket;
1956
1957 process:
1958         if (sk->sk_state == TCP_TIME_WAIT)
1959                 goto do_time_wait;
1960
1961         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1962                 struct request_sock *req = inet_reqsk(sk);
1963                 bool req_stolen = false;
1964                 struct sock *nsk;
1965
1966                 sk = req->rsk_listener;
1967                 drop_reason = tcp_inbound_md5_hash(sk, skb,
1968                                                    &iph->saddr, &iph->daddr,
1969                                                    AF_INET, dif, sdif);
1970                 if (unlikely(drop_reason)) {
1971                         sk_drops_add(sk, skb);
1972                         reqsk_put(req);
1973                         goto discard_it;
1974                 }
1975                 if (tcp_checksum_complete(skb)) {
1976                         reqsk_put(req);
1977                         goto csum_error;
1978                 }
1979                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1980                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1981                         if (!nsk) {
1982                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
1983                                 goto lookup;
1984                         }
1985                         sk = nsk;
1986                         /* reuseport_migrate_sock() has already held one sk_refcnt
1987                          * before returning.
1988                          */
1989                 } else {
1990                         /* We own a reference on the listener, increase it again
1991                          * as we might lose it too soon.
1992                          */
1993                         sock_hold(sk);
1994                 }
1995                 refcounted = true;
1996                 nsk = NULL;
1997                 if (!tcp_filter(sk, skb)) {
1998                         th = (const struct tcphdr *)skb->data;
1999                         iph = ip_hdr(skb);
2000                         tcp_v4_fill_cb(skb, iph, th);
2001                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2002                 } else {
2003                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2004                 }
2005                 if (!nsk) {
2006                         reqsk_put(req);
2007                         if (req_stolen) {
2008                                 /* Another cpu got exclusive access to req
2009                                  * and created a full blown socket.
2010                                  * Try to feed this packet to this socket
2011                                  * instead of discarding it.
2012                                  */
2013                                 tcp_v4_restore_cb(skb);
2014                                 sock_put(sk);
2015                                 goto lookup;
2016                         }
2017                         goto discard_and_relse;
2018                 }
2019                 if (nsk == sk) {
2020                         reqsk_put(req);
2021                         tcp_v4_restore_cb(skb);
2022                 } else if (tcp_child_process(sk, nsk, skb)) {
2023                         tcp_v4_send_reset(nsk, skb);
2024                         goto discard_and_relse;
2025                 } else {
2026                         sock_put(sk);
2027                         return 0;
2028                 }
2029         }
2030
2031         if (static_branch_unlikely(&ip4_min_ttl)) {
2032                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2033                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2034                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2035                         goto discard_and_relse;
2036                 }
2037         }
2038
2039         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2040                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2041                 goto discard_and_relse;
2042         }
2043
2044         drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2045                                            &iph->daddr, AF_INET, dif, sdif);
2046         if (drop_reason)
2047                 goto discard_and_relse;
2048
2049         nf_reset_ct(skb);
2050
2051         if (tcp_filter(sk, skb)) {
2052                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2053                 goto discard_and_relse;
2054         }
2055         th = (const struct tcphdr *)skb->data;
2056         iph = ip_hdr(skb);
2057         tcp_v4_fill_cb(skb, iph, th);
2058
2059         skb->dev = NULL;
2060
2061         if (sk->sk_state == TCP_LISTEN) {
2062                 ret = tcp_v4_do_rcv(sk, skb);
2063                 goto put_and_return;
2064         }
2065
2066         sk_incoming_cpu_update(sk);
2067
2068         bh_lock_sock_nested(sk);
2069         tcp_segs_in(tcp_sk(sk), skb);
2070         ret = 0;
2071         if (!sock_owned_by_user(sk)) {
2072                 ret = tcp_v4_do_rcv(sk, skb);
2073         } else {
2074                 if (tcp_add_backlog(sk, skb, &drop_reason))
2075                         goto discard_and_relse;
2076         }
2077         bh_unlock_sock(sk);
2078
2079 put_and_return:
2080         if (refcounted)
2081                 sock_put(sk);
2082
2083         return ret;
2084
2085 no_tcp_socket:
2086         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2087         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2088                 goto discard_it;
2089
2090         tcp_v4_fill_cb(skb, iph, th);
2091
2092         if (tcp_checksum_complete(skb)) {
2093 csum_error:
2094                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2095                 trace_tcp_bad_csum(skb);
2096                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2097 bad_packet:
2098                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2099         } else {
2100                 tcp_v4_send_reset(NULL, skb);
2101         }
2102
2103 discard_it:
2104         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2105         /* Discard frame. */
2106         kfree_skb_reason(skb, drop_reason);
2107         return 0;
2108
2109 discard_and_relse:
2110         sk_drops_add(sk, skb);
2111         if (refcounted)
2112                 sock_put(sk);
2113         goto discard_it;
2114
2115 do_time_wait:
2116         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2117                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2118                 inet_twsk_put(inet_twsk(sk));
2119                 goto discard_it;
2120         }
2121
2122         tcp_v4_fill_cb(skb, iph, th);
2123
2124         if (tcp_checksum_complete(skb)) {
2125                 inet_twsk_put(inet_twsk(sk));
2126                 goto csum_error;
2127         }
2128         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2129         case TCP_TW_SYN: {
2130                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2131                                                         &tcp_hashinfo, skb,
2132                                                         __tcp_hdrlen(th),
2133                                                         iph->saddr, th->source,
2134                                                         iph->daddr, th->dest,
2135                                                         inet_iif(skb),
2136                                                         sdif);
2137                 if (sk2) {
2138                         inet_twsk_deschedule_put(inet_twsk(sk));
2139                         sk = sk2;
2140                         tcp_v4_restore_cb(skb);
2141                         refcounted = false;
2142                         goto process;
2143                 }
2144         }
2145                 /* to ACK */
2146                 fallthrough;
2147         case TCP_TW_ACK:
2148                 tcp_v4_timewait_ack(sk, skb);
2149                 break;
2150         case TCP_TW_RST:
2151                 tcp_v4_send_reset(sk, skb);
2152                 inet_twsk_deschedule_put(inet_twsk(sk));
2153                 goto discard_it;
2154         case TCP_TW_SUCCESS:;
2155         }
2156         goto discard_it;
2157 }
2158
2159 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2160         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2161         .twsk_unique    = tcp_twsk_unique,
2162         .twsk_destructor = tcp_twsk_destructor,
2163 };
2164
2165 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2166 {
2167         struct dst_entry *dst = skb_dst(skb);
2168
2169         if (dst && dst_hold_safe(dst)) {
2170                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2171                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2172         }
2173 }
2174 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2175
2176 const struct inet_connection_sock_af_ops ipv4_specific = {
2177         .queue_xmit        = ip_queue_xmit,
2178         .send_check        = tcp_v4_send_check,
2179         .rebuild_header    = inet_sk_rebuild_header,
2180         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2181         .conn_request      = tcp_v4_conn_request,
2182         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2183         .net_header_len    = sizeof(struct iphdr),
2184         .setsockopt        = ip_setsockopt,
2185         .getsockopt        = ip_getsockopt,
2186         .addr2sockaddr     = inet_csk_addr2sockaddr,
2187         .sockaddr_len      = sizeof(struct sockaddr_in),
2188         .mtu_reduced       = tcp_v4_mtu_reduced,
2189 };
2190 EXPORT_SYMBOL(ipv4_specific);
2191
2192 #ifdef CONFIG_TCP_MD5SIG
2193 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2194         .md5_lookup             = tcp_v4_md5_lookup,
2195         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2196         .md5_parse              = tcp_v4_parse_md5_keys,
2197 };
2198 #endif
2199
2200 /* NOTE: A lot of things are set to zero explicitly by the call to
2201  *       sk_alloc(), so they need not be done here.
2202  */
2203 static int tcp_v4_init_sock(struct sock *sk)
2204 {
2205         struct inet_connection_sock *icsk = inet_csk(sk);
2206
2207         tcp_init_sock(sk);
2208
2209         icsk->icsk_af_ops = &ipv4_specific;
2210
2211 #ifdef CONFIG_TCP_MD5SIG
2212         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2213 #endif
2214
2215         return 0;
2216 }
2217
2218 void tcp_v4_destroy_sock(struct sock *sk)
2219 {
2220         struct tcp_sock *tp = tcp_sk(sk);
2221
2222         trace_tcp_destroy_sock(sk);
2223
2224         tcp_clear_xmit_timers(sk);
2225
2226         tcp_cleanup_congestion_control(sk);
2227
2228         tcp_cleanup_ulp(sk);
2229
2230         /* Clean up the write buffer. */
2231         tcp_write_queue_purge(sk);
2232
2233         /* Check if we want to disable active TFO */
2234         tcp_fastopen_active_disable_ofo_check(sk);
2235
2236         /* Cleans up our, hopefully empty, out_of_order_queue. */
2237         skb_rbtree_purge(&tp->out_of_order_queue);
2238
2239 #ifdef CONFIG_TCP_MD5SIG
2240         /* Clean up the MD5 key list, if any */
2241         if (tp->md5sig_info) {
2242                 tcp_clear_md5_list(sk);
2243                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2244                 tp->md5sig_info = NULL;
2245         }
2246 #endif
2247
2248         /* Clean up a referenced TCP bind bucket. */
2249         if (inet_csk(sk)->icsk_bind_hash)
2250                 inet_put_port(sk);
2251
2252         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2253
2254         /* If socket is aborted during connect operation */
2255         tcp_free_fastopen_req(tp);
2256         tcp_fastopen_destroy_cipher(sk);
2257         tcp_saved_syn_free(tp);
2258
2259         sk_sockets_allocated_dec(sk);
2260 }
2261 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2262
2263 #ifdef CONFIG_PROC_FS
2264 /* Proc filesystem TCP sock list dumping. */
2265
2266 static unsigned short seq_file_family(const struct seq_file *seq);
2267
2268 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2269 {
2270         unsigned short family = seq_file_family(seq);
2271
2272         /* AF_UNSPEC is used as a match all */
2273         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2274                 net_eq(sock_net(sk), seq_file_net(seq)));
2275 }
2276
2277 /* Find a non empty bucket (starting from st->bucket)
2278  * and return the first sk from it.
2279  */
2280 static void *listening_get_first(struct seq_file *seq)
2281 {
2282         struct tcp_iter_state *st = seq->private;
2283
2284         st->offset = 0;
2285         for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2286                 struct inet_listen_hashbucket *ilb2;
2287                 struct hlist_nulls_node *node;
2288                 struct sock *sk;
2289
2290                 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2291                 if (hlist_nulls_empty(&ilb2->nulls_head))
2292                         continue;
2293
2294                 spin_lock(&ilb2->lock);
2295                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2296                         if (seq_sk_match(seq, sk))
2297                                 return sk;
2298                 }
2299                 spin_unlock(&ilb2->lock);
2300         }
2301
2302         return NULL;
2303 }
2304
2305 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2306  * If "cur" is the last one in the st->bucket,
2307  * call listening_get_first() to return the first sk of the next
2308  * non empty bucket.
2309  */
2310 static void *listening_get_next(struct seq_file *seq, void *cur)
2311 {
2312         struct tcp_iter_state *st = seq->private;
2313         struct inet_listen_hashbucket *ilb2;
2314         struct hlist_nulls_node *node;
2315         struct sock *sk = cur;
2316
2317         ++st->num;
2318         ++st->offset;
2319
2320         sk = sk_nulls_next(sk);
2321         sk_nulls_for_each_from(sk, node) {
2322                 if (seq_sk_match(seq, sk))
2323                         return sk;
2324         }
2325
2326         ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2327         spin_unlock(&ilb2->lock);
2328         ++st->bucket;
2329         return listening_get_first(seq);
2330 }
2331
2332 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2333 {
2334         struct tcp_iter_state *st = seq->private;
2335         void *rc;
2336
2337         st->bucket = 0;
2338         st->offset = 0;
2339         rc = listening_get_first(seq);
2340
2341         while (rc && *pos) {
2342                 rc = listening_get_next(seq, rc);
2343                 --*pos;
2344         }
2345         return rc;
2346 }
2347
2348 static inline bool empty_bucket(const struct tcp_iter_state *st)
2349 {
2350         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2351 }
2352
2353 /*
2354  * Get first established socket starting from bucket given in st->bucket.
2355  * If st->bucket is zero, the very first socket in the hash is returned.
2356  */
2357 static void *established_get_first(struct seq_file *seq)
2358 {
2359         struct tcp_iter_state *st = seq->private;
2360
2361         st->offset = 0;
2362         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2363                 struct sock *sk;
2364                 struct hlist_nulls_node *node;
2365                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2366
2367                 /* Lockless fast path for the common case of empty buckets */
2368                 if (empty_bucket(st))
2369                         continue;
2370
2371                 spin_lock_bh(lock);
2372                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2373                         if (seq_sk_match(seq, sk))
2374                                 return sk;
2375                 }
2376                 spin_unlock_bh(lock);
2377         }
2378
2379         return NULL;
2380 }
2381
2382 static void *established_get_next(struct seq_file *seq, void *cur)
2383 {
2384         struct sock *sk = cur;
2385         struct hlist_nulls_node *node;
2386         struct tcp_iter_state *st = seq->private;
2387
2388         ++st->num;
2389         ++st->offset;
2390
2391         sk = sk_nulls_next(sk);
2392
2393         sk_nulls_for_each_from(sk, node) {
2394                 if (seq_sk_match(seq, sk))
2395                         return sk;
2396         }
2397
2398         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2399         ++st->bucket;
2400         return established_get_first(seq);
2401 }
2402
2403 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2404 {
2405         struct tcp_iter_state *st = seq->private;
2406         void *rc;
2407
2408         st->bucket = 0;
2409         rc = established_get_first(seq);
2410
2411         while (rc && pos) {
2412                 rc = established_get_next(seq, rc);
2413                 --pos;
2414         }
2415         return rc;
2416 }
2417
2418 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2419 {
2420         void *rc;
2421         struct tcp_iter_state *st = seq->private;
2422
2423         st->state = TCP_SEQ_STATE_LISTENING;
2424         rc        = listening_get_idx(seq, &pos);
2425
2426         if (!rc) {
2427                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2428                 rc        = established_get_idx(seq, pos);
2429         }
2430
2431         return rc;
2432 }
2433
2434 static void *tcp_seek_last_pos(struct seq_file *seq)
2435 {
2436         struct tcp_iter_state *st = seq->private;
2437         int bucket = st->bucket;
2438         int offset = st->offset;
2439         int orig_num = st->num;
2440         void *rc = NULL;
2441
2442         switch (st->state) {
2443         case TCP_SEQ_STATE_LISTENING:
2444                 if (st->bucket > tcp_hashinfo.lhash2_mask)
2445                         break;
2446                 st->state = TCP_SEQ_STATE_LISTENING;
2447                 rc = listening_get_first(seq);
2448                 while (offset-- && rc && bucket == st->bucket)
2449                         rc = listening_get_next(seq, rc);
2450                 if (rc)
2451                         break;
2452                 st->bucket = 0;
2453                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2454                 fallthrough;
2455         case TCP_SEQ_STATE_ESTABLISHED:
2456                 if (st->bucket > tcp_hashinfo.ehash_mask)
2457                         break;
2458                 rc = established_get_first(seq);
2459                 while (offset-- && rc && bucket == st->bucket)
2460                         rc = established_get_next(seq, rc);
2461         }
2462
2463         st->num = orig_num;
2464
2465         return rc;
2466 }
2467
2468 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2469 {
2470         struct tcp_iter_state *st = seq->private;
2471         void *rc;
2472
2473         if (*pos && *pos == st->last_pos) {
2474                 rc = tcp_seek_last_pos(seq);
2475                 if (rc)
2476                         goto out;
2477         }
2478
2479         st->state = TCP_SEQ_STATE_LISTENING;
2480         st->num = 0;
2481         st->bucket = 0;
2482         st->offset = 0;
2483         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2484
2485 out:
2486         st->last_pos = *pos;
2487         return rc;
2488 }
2489 EXPORT_SYMBOL(tcp_seq_start);
2490
2491 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2492 {
2493         struct tcp_iter_state *st = seq->private;
2494         void *rc = NULL;
2495
2496         if (v == SEQ_START_TOKEN) {
2497                 rc = tcp_get_idx(seq, 0);
2498                 goto out;
2499         }
2500
2501         switch (st->state) {
2502         case TCP_SEQ_STATE_LISTENING:
2503                 rc = listening_get_next(seq, v);
2504                 if (!rc) {
2505                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2506                         st->bucket = 0;
2507                         st->offset = 0;
2508                         rc        = established_get_first(seq);
2509                 }
2510                 break;
2511         case TCP_SEQ_STATE_ESTABLISHED:
2512                 rc = established_get_next(seq, v);
2513                 break;
2514         }
2515 out:
2516         ++*pos;
2517         st->last_pos = *pos;
2518         return rc;
2519 }
2520 EXPORT_SYMBOL(tcp_seq_next);
2521
2522 void tcp_seq_stop(struct seq_file *seq, void *v)
2523 {
2524         struct tcp_iter_state *st = seq->private;
2525
2526         switch (st->state) {
2527         case TCP_SEQ_STATE_LISTENING:
2528                 if (v != SEQ_START_TOKEN)
2529                         spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2530                 break;
2531         case TCP_SEQ_STATE_ESTABLISHED:
2532                 if (v)
2533                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2534                 break;
2535         }
2536 }
2537 EXPORT_SYMBOL(tcp_seq_stop);
2538
2539 static void get_openreq4(const struct request_sock *req,
2540                          struct seq_file *f, int i)
2541 {
2542         const struct inet_request_sock *ireq = inet_rsk(req);
2543         long delta = req->rsk_timer.expires - jiffies;
2544
2545         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2546                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2547                 i,
2548                 ireq->ir_loc_addr,
2549                 ireq->ir_num,
2550                 ireq->ir_rmt_addr,
2551                 ntohs(ireq->ir_rmt_port),
2552                 TCP_SYN_RECV,
2553                 0, 0, /* could print option size, but that is af dependent. */
2554                 1,    /* timers active (only the expire timer) */
2555                 jiffies_delta_to_clock_t(delta),
2556                 req->num_timeout,
2557                 from_kuid_munged(seq_user_ns(f),
2558                                  sock_i_uid(req->rsk_listener)),
2559                 0,  /* non standard timer */
2560                 0, /* open_requests have no inode */
2561                 0,
2562                 req);
2563 }
2564
2565 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2566 {
2567         int timer_active;
2568         unsigned long timer_expires;
2569         const struct tcp_sock *tp = tcp_sk(sk);
2570         const struct inet_connection_sock *icsk = inet_csk(sk);
2571         const struct inet_sock *inet = inet_sk(sk);
2572         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2573         __be32 dest = inet->inet_daddr;
2574         __be32 src = inet->inet_rcv_saddr;
2575         __u16 destp = ntohs(inet->inet_dport);
2576         __u16 srcp = ntohs(inet->inet_sport);
2577         int rx_queue;
2578         int state;
2579
2580         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2581             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2582             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2583                 timer_active    = 1;
2584                 timer_expires   = icsk->icsk_timeout;
2585         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2586                 timer_active    = 4;
2587                 timer_expires   = icsk->icsk_timeout;
2588         } else if (timer_pending(&sk->sk_timer)) {
2589                 timer_active    = 2;
2590                 timer_expires   = sk->sk_timer.expires;
2591         } else {
2592                 timer_active    = 0;
2593                 timer_expires = jiffies;
2594         }
2595
2596         state = inet_sk_state_load(sk);
2597         if (state == TCP_LISTEN)
2598                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2599         else
2600                 /* Because we don't lock the socket,
2601                  * we might find a transient negative value.
2602                  */
2603                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2604                                       READ_ONCE(tp->copied_seq), 0);
2605
2606         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2607                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2608                 i, src, srcp, dest, destp, state,
2609                 READ_ONCE(tp->write_seq) - tp->snd_una,
2610                 rx_queue,
2611                 timer_active,
2612                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2613                 icsk->icsk_retransmits,
2614                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2615                 icsk->icsk_probes_out,
2616                 sock_i_ino(sk),
2617                 refcount_read(&sk->sk_refcnt), sk,
2618                 jiffies_to_clock_t(icsk->icsk_rto),
2619                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2620                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2621                 tcp_snd_cwnd(tp),
2622                 state == TCP_LISTEN ?
2623                     fastopenq->max_qlen :
2624                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2625 }
2626
2627 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2628                                struct seq_file *f, int i)
2629 {
2630         long delta = tw->tw_timer.expires - jiffies;
2631         __be32 dest, src;
2632         __u16 destp, srcp;
2633
2634         dest  = tw->tw_daddr;
2635         src   = tw->tw_rcv_saddr;
2636         destp = ntohs(tw->tw_dport);
2637         srcp  = ntohs(tw->tw_sport);
2638
2639         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2640                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2641                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2642                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2643                 refcount_read(&tw->tw_refcnt), tw);
2644 }
2645
2646 #define TMPSZ 150
2647
2648 static int tcp4_seq_show(struct seq_file *seq, void *v)
2649 {
2650         struct tcp_iter_state *st;
2651         struct sock *sk = v;
2652
2653         seq_setwidth(seq, TMPSZ - 1);
2654         if (v == SEQ_START_TOKEN) {
2655                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2656                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2657                            "inode");
2658                 goto out;
2659         }
2660         st = seq->private;
2661
2662         if (sk->sk_state == TCP_TIME_WAIT)
2663                 get_timewait4_sock(v, seq, st->num);
2664         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2665                 get_openreq4(v, seq, st->num);
2666         else
2667                 get_tcp4_sock(v, seq, st->num);
2668 out:
2669         seq_pad(seq, '\n');
2670         return 0;
2671 }
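
/*
 * Illustrative only: with the header printed in tcp4_seq_show() and the
 * per-socket format in get_tcp4_sock() above, a /proc/net/tcp entry for an
 * established 127.0.0.1:8080 <-> 127.0.0.1:53988 connection could look
 * roughly like this (all values are invented for the example):
 *
 *   sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
 *    0: 0100007F:1F90 0100007F:D2E4 01 00000000:00000000 00:00000000 00000000  1000        0 34533 1 0000000000000000 20 4 30 10 -1
 *
 * Ports are printed in host byte order (8080 == 0x1F90), while addresses
 * are the raw __be32 printed as a native integer, so 127.0.0.1 shows up as
 * 0100007F on a little-endian machine.  The fields after "inode" are the
 * extra get_tcp4_sock() arguments (refcount, sk pointer, rto, ato,
 * quick/pingpong, snd_cwnd, ssthresh or -1 in initial slow start).
 */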
2672
2673 #ifdef CONFIG_BPF_SYSCALL
2674 struct bpf_tcp_iter_state {
2675         struct tcp_iter_state state;
2676         unsigned int cur_sk;
2677         unsigned int end_sk;
2678         unsigned int max_sk;
2679         struct sock **batch;
2680         bool st_bucket_done;
2681 };
2682
2683 struct bpf_iter__tcp {
2684         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2685         __bpf_md_ptr(struct sock_common *, sk_common);
2686         uid_t uid __aligned(8);
2687 };
2688
2689 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2690                              struct sock_common *sk_common, uid_t uid)
2691 {
2692         struct bpf_iter__tcp ctx;
2693
2694         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2695         ctx.meta = meta;
2696         ctx.sk_common = sk_common;
2697         ctx.uid = uid;
2698         return bpf_iter_run_prog(prog, &ctx);
2699 }
2700
2701 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2702 {
2703         while (iter->cur_sk < iter->end_sk)
2704                 sock_put(iter->batch[iter->cur_sk++]);
2705 }
2706
2707 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2708                                       unsigned int new_batch_sz)
2709 {
2710         struct sock **new_batch;
2711
2712         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2713                              GFP_USER | __GFP_NOWARN);
2714         if (!new_batch)
2715                 return -ENOMEM;
2716
2717         bpf_iter_tcp_put_batch(iter);
2718         kvfree(iter->batch);
2719         iter->batch = new_batch;
2720         iter->max_sk = new_batch_sz;
2721
2722         return 0;
2723 }
2724
2725 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2726                                                  struct sock *start_sk)
2727 {
2728         struct bpf_tcp_iter_state *iter = seq->private;
2729         struct tcp_iter_state *st = &iter->state;
2730         struct hlist_nulls_node *node;
2731         unsigned int expected = 1;
2732         struct sock *sk;
2733
2734         sock_hold(start_sk);
2735         iter->batch[iter->end_sk++] = start_sk;
2736
2737         sk = sk_nulls_next(start_sk);
2738         sk_nulls_for_each_from(sk, node) {
2739                 if (seq_sk_match(seq, sk)) {
2740                         if (iter->end_sk < iter->max_sk) {
2741                                 sock_hold(sk);
2742                                 iter->batch[iter->end_sk++] = sk;
2743                         }
2744                         expected++;
2745                 }
2746         }
2747         spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2748
2749         return expected;
2750 }
2751
2752 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2753                                                    struct sock *start_sk)
2754 {
2755         struct bpf_tcp_iter_state *iter = seq->private;
2756         struct tcp_iter_state *st = &iter->state;
2757         struct hlist_nulls_node *node;
2758         unsigned int expected = 1;
2759         struct sock *sk;
2760
2761         sock_hold(start_sk);
2762         iter->batch[iter->end_sk++] = start_sk;
2763
2764         sk = sk_nulls_next(start_sk);
2765         sk_nulls_for_each_from(sk, node) {
2766                 if (seq_sk_match(seq, sk)) {
2767                         if (iter->end_sk < iter->max_sk) {
2768                                 sock_hold(sk);
2769                                 iter->batch[iter->end_sk++] = sk;
2770                         }
2771                         expected++;
2772                 }
2773         }
2774         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2775
2776         return expected;
2777 }
2778
2779 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2780 {
2781         struct bpf_tcp_iter_state *iter = seq->private;
2782         struct tcp_iter_state *st = &iter->state;
2783         unsigned int expected;
2784         bool resized = false;
2785         struct sock *sk;
2786
2787         /* The st->bucket is done.  Directly advance to the next
2788          * bucket instead of having tcp_seek_last_pos() skip entries
2789          * one by one in the current bucket, only to eventually find out
2790          * it has to advance to the next bucket.
2791          */
2792         if (iter->st_bucket_done) {
2793                 st->offset = 0;
2794                 st->bucket++;
2795                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2796                     st->bucket > tcp_hashinfo.lhash2_mask) {
2797                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2798                         st->bucket = 0;
2799                 }
2800         }
2801
2802 again:
2803         /* Get a new batch */
2804         iter->cur_sk = 0;
2805         iter->end_sk = 0;
2806         iter->st_bucket_done = false;
2807
2808         sk = tcp_seek_last_pos(seq);
2809         if (!sk)
2810                 return NULL; /* Done */
2811
2812         if (st->state == TCP_SEQ_STATE_LISTENING)
2813                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2814         else
2815                 expected = bpf_iter_tcp_established_batch(seq, sk);
2816
2817         if (iter->end_sk == expected) {
2818                 iter->st_bucket_done = true;
2819                 return sk;
2820         }
2821
2822         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2823                 resized = true;
2824                 goto again;
2825         }
2826
2827         return sk;
2828 }
2829
2830 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2831 {
2832         /* bpf iter does not support lseek, so it always
2833          * continues from where it was stop()-ped.
2834          */
2835         if (*pos)
2836                 return bpf_iter_tcp_batch(seq);
2837
2838         return SEQ_START_TOKEN;
2839 }
2840
2841 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2842 {
2843         struct bpf_tcp_iter_state *iter = seq->private;
2844         struct tcp_iter_state *st = &iter->state;
2845         struct sock *sk;
2846
2847         /* Whenever seq_next() is called, the iter->cur_sk is
2848          * done with seq_show(), so advance to the next sk in
2849          * the batch.
2850          */
2851         if (iter->cur_sk < iter->end_sk) {
2852                 /* Keeping st->num consistent in tcp_iter_state.
2853                  * bpf_iter_tcp does not use st->num.
2854                  * meta.seq_num is used instead.
2855                  */
2856                 st->num++;
2857                 /* Move st->offset to the next sk in the bucket such that
2858                  * the future start() will resume at st->offset in
2859                  * st->bucket.  See tcp_seek_last_pos().
2860                  */
2861                 st->offset++;
2862                 sock_put(iter->batch[iter->cur_sk++]);
2863         }
2864
2865         if (iter->cur_sk < iter->end_sk)
2866                 sk = iter->batch[iter->cur_sk];
2867         else
2868                 sk = bpf_iter_tcp_batch(seq);
2869
2870         ++*pos;
2871         /* Keeping st->last_pos consistent in tcp_iter_state.
2872          * bpf iter does not do lseek, so st->last_pos always equals *pos.
2873          */
2874         st->last_pos = *pos;
2875         return sk;
2876 }
2877
2878 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2879 {
2880         struct bpf_iter_meta meta;
2881         struct bpf_prog *prog;
2882         struct sock *sk = v;
2883         bool slow;
2884         uid_t uid;
2885         int ret;
2886
2887         if (v == SEQ_START_TOKEN)
2888                 return 0;
2889
2890         if (sk_fullsock(sk))
2891                 slow = lock_sock_fast(sk);
2892
2893         if (unlikely(sk_unhashed(sk))) {
2894                 ret = SEQ_SKIP;
2895                 goto unlock;
2896         }
2897
2898         if (sk->sk_state == TCP_TIME_WAIT) {
2899                 uid = 0;
2900         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2901                 const struct request_sock *req = v;
2902
2903                 uid = from_kuid_munged(seq_user_ns(seq),
2904                                        sock_i_uid(req->rsk_listener));
2905         } else {
2906                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2907         }
2908
2909         meta.seq = seq;
2910         prog = bpf_iter_get_info(&meta, false);
2911         ret = tcp_prog_seq_show(prog, &meta, v, uid);
2912
2913 unlock:
2914         if (sk_fullsock(sk))
2915                 unlock_sock_fast(sk, slow);
2916         return ret;
2918 }
2919
2920 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2921 {
2922         struct bpf_tcp_iter_state *iter = seq->private;
2923         struct bpf_iter_meta meta;
2924         struct bpf_prog *prog;
2925
2926         if (!v) {
2927                 meta.seq = seq;
2928                 prog = bpf_iter_get_info(&meta, true);
2929                 if (prog)
2930                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2931         }
2932
2933         if (iter->cur_sk < iter->end_sk) {
2934                 bpf_iter_tcp_put_batch(iter);
2935                 iter->st_bucket_done = false;
2936         }
2937 }
2938
2939 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2940         .show           = bpf_iter_tcp_seq_show,
2941         .start          = bpf_iter_tcp_seq_start,
2942         .next           = bpf_iter_tcp_seq_next,
2943         .stop           = bpf_iter_tcp_seq_stop,
2944 };
2945 #endif
2946 static unsigned short seq_file_family(const struct seq_file *seq)
2947 {
2948         const struct tcp_seq_afinfo *afinfo;
2949
2950 #ifdef CONFIG_BPF_SYSCALL
2951         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2952         if (seq->op == &bpf_iter_tcp_seq_ops)
2953                 return AF_UNSPEC;
2954 #endif
2955
2956         /* Iterated from proc fs */
2957         afinfo = pde_data(file_inode(seq->file));
2958         return afinfo->family;
2959 }
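/*
 * Illustrative sketch (not part of this file): when iterating from
 * bpf_iter, seq_file_family() above returns AF_UNSPEC and leaves the
 * address-family filtering to the BPF program.  A minimal iter/tcp
 * program doing that filtering could look like the following; this is
 * an example in selftest style, built as a separate BPF object with
 * vmlinux.h and libbpf headers, not kernel code, and "dump_tcp4" is a
 * hypothetical name.
 */
#if 0	/* example BPF program, for illustration only */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define AF_INET	2	/* UAPI value; vmlinux.h does not carry this define */

char _license[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_tcp4(struct bpf_iter__tcp *ctx)
{
        struct sock_common *skc = ctx->sk_common;

        if (!skc)
                return 0;	/* end-of-iteration callback */
        if (skc->skc_family != AF_INET)
                return 0;	/* filter here, not in seq_file_family() */

        /* skc_num is the local port in host byte order */
        BPF_SEQ_PRINTF(ctx->meta->seq, "lport %u uid %u\n",
                       skc->skc_num, ctx->uid);
        return 0;
}
#endif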
2960
2961 static const struct seq_operations tcp4_seq_ops = {
2962         .show           = tcp4_seq_show,
2963         .start          = tcp_seq_start,
2964         .next           = tcp_seq_next,
2965         .stop           = tcp_seq_stop,
2966 };
2967
2968 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2969         .family         = AF_INET,
2970 };
2971
2972 static int __net_init tcp4_proc_init_net(struct net *net)
2973 {
2974         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2975                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2976                 return -ENOMEM;
2977         return 0;
2978 }
2979
2980 static void __net_exit tcp4_proc_exit_net(struct net *net)
2981 {
2982         remove_proc_entry("tcp", net->proc_net);
2983 }
2984
2985 static struct pernet_operations tcp4_net_ops = {
2986         .init = tcp4_proc_init_net,
2987         .exit = tcp4_proc_exit_net,
2988 };
2989
2990 int __init tcp4_proc_init(void)
2991 {
2992         return register_pernet_subsys(&tcp4_net_ops);
2993 }
2994
2995 void tcp4_proc_exit(void)
2996 {
2997         unregister_pernet_subsys(&tcp4_net_ops);
2998 }
2999 #endif /* CONFIG_PROC_FS */
3000
3001 /* @wake is one when sk_stream_write_space() calls us.
3002  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3003  * This mimics the strategy used in sock_def_write_space().
3004  */
3005 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3006 {
3007         const struct tcp_sock *tp = tcp_sk(sk);
3008         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3009                             READ_ONCE(tp->snd_nxt);
3010
3011         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3012 }
3013 EXPORT_SYMBOL(tcp_stream_memory_free);
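/*
 * Worked example (not part of this file): with TCP_NOTSENT_LOWAT set to
 * 128 KB, the plain (wake == 0) check above reports writable space while
 * write_seq - snd_nxt is below 128 KB.  When sk_stream_write_space()
 * calls us with wake == 1, notsent_bytes is doubled before the compare,
 * so EPOLLOUT is only signalled once the unsent backlog drops below
 * 64 KB.  Userspace sets the per-socket limit like this:
 */
#if 0	/* userspace sketch, assuming <netinet/tcp.h> */
        int lowat = 128 * 1024;

        setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
#endif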
3014
3015 struct proto tcp_prot = {
3016         .name                   = "TCP",
3017         .owner                  = THIS_MODULE,
3018         .close                  = tcp_close,
3019         .pre_connect            = tcp_v4_pre_connect,
3020         .connect                = tcp_v4_connect,
3021         .disconnect             = tcp_disconnect,
3022         .accept                 = inet_csk_accept,
3023         .ioctl                  = tcp_ioctl,
3024         .init                   = tcp_v4_init_sock,
3025         .destroy                = tcp_v4_destroy_sock,
3026         .shutdown               = tcp_shutdown,
3027         .setsockopt             = tcp_setsockopt,
3028         .getsockopt             = tcp_getsockopt,
3029         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3030         .keepalive              = tcp_set_keepalive,
3031         .recvmsg                = tcp_recvmsg,
3032         .sendmsg                = tcp_sendmsg,
3033         .sendpage               = tcp_sendpage,
3034         .backlog_rcv            = tcp_v4_do_rcv,
3035         .release_cb             = tcp_release_cb,
3036         .hash                   = inet_hash,
3037         .unhash                 = inet_unhash,
3038         .get_port               = inet_csk_get_port,
3039         .put_port               = inet_put_port,
3040 #ifdef CONFIG_BPF_SYSCALL
3041         .psock_update_sk_prot   = tcp_bpf_update_proto,
3042 #endif
3043         .enter_memory_pressure  = tcp_enter_memory_pressure,
3044         .leave_memory_pressure  = tcp_leave_memory_pressure,
3045         .stream_memory_free     = tcp_stream_memory_free,
3046         .sockets_allocated      = &tcp_sockets_allocated,
3047         .orphan_count           = &tcp_orphan_count,
3048         .memory_allocated       = &tcp_memory_allocated,
3049         .memory_pressure        = &tcp_memory_pressure,
3050         .sysctl_mem             = sysctl_tcp_mem,
3051         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3052         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3053         .max_header             = MAX_TCP_HEADER,
3054         .obj_size               = sizeof(struct tcp_sock),
3055         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3056         .twsk_prot              = &tcp_timewait_sock_ops,
3057         .rsk_prot               = &tcp_request_sock_ops,
3058         .h.hashinfo             = &tcp_hashinfo,
3059         .no_autobind            = true,
3060         .diag_destroy           = tcp_abort,
3061 };
3062 EXPORT_SYMBOL(tcp_prot);
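/*
 * Context sketch (not part of this file): tcp_prot is registered at boot
 * by inet_init() via proto_register(&tcp_prot, 1) and wired into the
 * SOCK_STREAM/IPPROTO_TCP entry of the inetsw table, after which the
 * generic socket layer dispatches through sk->sk_prot, e.g.:
 */
#if 0	/* simplified dispatch path, for illustration only */
        err = sk->sk_prot->connect(sk, uaddr, addr_len);	/* -> tcp_v4_connect() */
#endif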
3063
3064 static void __net_exit tcp_sk_exit(struct net *net)
3065 {
3066         struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3067
3068         if (net->ipv4.tcp_congestion_control)
3069                 bpf_module_put(net->ipv4.tcp_congestion_control,
3070                                net->ipv4.tcp_congestion_control->owner);
3071         if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3072                 kfree(tcp_death_row);
3073 }
3074
3075 static int __net_init tcp_sk_init(struct net *net)
3076 {
3077         int cnt;
3078
3079         net->ipv4.sysctl_tcp_ecn = 2;
3080         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3081
3082         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3083         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3084         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3085         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3086         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3087
3088         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3089         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3090         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3091
3092         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3093         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3094         net->ipv4.sysctl_tcp_syncookies = 1;
3095         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3096         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3097         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3098         net->ipv4.sysctl_tcp_orphan_retries = 0;
3099         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3100         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3101         net->ipv4.sysctl_tcp_tw_reuse = 2;
3102         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3103
3104         net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3105         if (!net->ipv4.tcp_death_row)
3106                 return -ENOMEM;
3107         refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3108         cnt = tcp_hashinfo.ehash_mask + 1;
3109         net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3110         net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3111
3112         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3113         net->ipv4.sysctl_tcp_sack = 1;
3114         net->ipv4.sysctl_tcp_window_scaling = 1;
3115         net->ipv4.sysctl_tcp_timestamps = 1;
3116         net->ipv4.sysctl_tcp_early_retrans = 3;
3117         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3118         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3119         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3120         net->ipv4.sysctl_tcp_max_reordering = 300;
3121         net->ipv4.sysctl_tcp_dsack = 1;
3122         net->ipv4.sysctl_tcp_app_win = 31;
3123         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3124         net->ipv4.sysctl_tcp_frto = 2;
3125         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3126         /* This limits the percentage of the congestion window which we
3127          * will allow a single TSO frame to consume.  Building TSO frames
3128          * which are too large can cause TCP streams to be bursty.
3129          */
3130         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3131         /* Default TSQ limit of 16 TSO segments */
3132         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3133         /* rfc5961 challenge ack rate limiting */
3134         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3135         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3136         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3137         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3138         net->ipv4.sysctl_tcp_autocorking = 1;
3139         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3140         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3141         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3142         if (net != &init_net) {
3143                 memcpy(net->ipv4.sysctl_tcp_rmem,
3144                        init_net.ipv4.sysctl_tcp_rmem,
3145                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3146                 memcpy(net->ipv4.sysctl_tcp_wmem,
3147                        init_net.ipv4.sysctl_tcp_wmem,
3148                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3149         }
3150         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3151         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3152         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3153         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3154         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3155         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3156
3157         /* Reno is always built in */
3158         if (!net_eq(net, &init_net) &&
3159             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3160                                init_net.ipv4.tcp_congestion_control->owner))
3161                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3162         else
3163                 net->ipv4.tcp_congestion_control = &tcp_reno;
3164
3165         return 0;
3166 }
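/*
 * Illustrative sketch (not part of this file): each value initialised in
 * tcp_sk_init() is a per-netns default and shows up under
 * /proc/sys/net/ipv4/ inside that namespace (tcp_rmem/tcp_wmem and the
 * congestion control are instead inherited from init_net above).  A
 * quick userspace check of one of them:
 */
#if 0	/* userspace sketch */
#include <stdio.h>

int main(void)
{
        char buf[32];
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "r");

        if (f && fgets(buf, sizeof(buf), f))
                printf("tcp_syncookies = %s", buf);	/* "1" by default, per tcp_sk_init() */
        if (f)
                fclose(f);
        return 0;
}
#endif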
3167
3168 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3169 {
3170         struct net *net;
3171
3172         inet_twsk_purge(&tcp_hashinfo, AF_INET);
3173
3174         list_for_each_entry(net, net_exit_list, exit_list)
3175                 tcp_fastopen_ctx_destroy(net);
3176 }
3177
3178 static struct pernet_operations __net_initdata tcp_sk_ops = {
3179        .init       = tcp_sk_init,
3180        .exit       = tcp_sk_exit,
3181        .exit_batch = tcp_sk_exit_batch,
3182 };
3183
3184 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3185 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3186                      struct sock_common *sk_common, uid_t uid)
3187
3188 #define INIT_BATCH_SZ 16
3189
3190 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3191 {
3192         struct bpf_tcp_iter_state *iter = priv_data;
3193         int err;
3194
3195         err = bpf_iter_init_seq_net(priv_data, aux);
3196         if (err)
3197                 return err;
3198
3199         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3200         if (err) {
3201                 bpf_iter_fini_seq_net(priv_data);
3202                 return err;
3203         }
3204
3205         return 0;
3206 }
3207
3208 static void bpf_iter_fini_tcp(void *priv_data)
3209 {
3210         struct bpf_tcp_iter_state *iter = priv_data;
3211
3212         bpf_iter_fini_seq_net(priv_data);
3213         kvfree(iter->batch);
3214 }
3215
3216 static const struct bpf_iter_seq_info tcp_seq_info = {
3217         .seq_ops                = &bpf_iter_tcp_seq_ops,
3218         .init_seq_private       = bpf_iter_init_tcp,
3219         .fini_seq_private       = bpf_iter_fini_tcp,
3220         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3221 };
3222
3223 static const struct bpf_func_proto *
3224 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3225                             const struct bpf_prog *prog)
3226 {
3227         switch (func_id) {
3228         case BPF_FUNC_setsockopt:
3229                 return &bpf_sk_setsockopt_proto;
3230         case BPF_FUNC_getsockopt:
3231                 return &bpf_sk_getsockopt_proto;
3232         default:
3233                 return NULL;
3234         }
3235 }
3236
3237 static struct bpf_iter_reg tcp_reg_info = {
3238         .target                 = "tcp",
3239         .ctx_arg_info_size      = 1,
3240         .ctx_arg_info           = {
3241                 { offsetof(struct bpf_iter__tcp, sk_common),
3242                   PTR_TO_BTF_ID_OR_NULL },
3243         },
3244         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3245         .seq_info               = &tcp_seq_info,
3246 };
3247
3248 static void __init bpf_iter_register(void)
3249 {
3250         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3251         if (bpf_iter_reg_target(&tcp_reg_info))
3252                 pr_warn("Warning: could not register bpf iterator tcp\n");
3253 }
3254
3255 #endif
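/*
 * Illustrative userspace sketch (not part of this file): once the "tcp"
 * iterator target registered above is available, a libbpf application
 * can attach an iter/tcp program and read its seq_file output through
 * an iterator fd.  Error handling is trimmed, and "skel->progs.dump_tcp4"
 * refers to the hypothetical program sketched near seq_file_family().
 */
#if 0	/* userspace sketch, assuming libbpf */
        struct bpf_link *link;
        char buf[4096];
        ssize_t n;
        int iter_fd;

        link = bpf_program__attach_iter(skel->progs.dump_tcp4, NULL);
        iter_fd = bpf_iter_create(bpf_link__fd(link));
        while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, n, stdout);	/* lines produced by BPF_SEQ_PRINTF() */
        close(iter_fd);
        bpf_link__destroy(link);
#endif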
3256
3257 void __init tcp_v4_init(void)
3258 {
3259         int cpu, res;
3260
3261         for_each_possible_cpu(cpu) {
3262                 struct sock *sk;
3263
3264                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3265                                            IPPROTO_TCP, &init_net);
3266                 if (res)
3267                         panic("Failed to create the TCP control socket.\n");
3268                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3269
3270                 /* Enforce IP_DF and IPID==0 for RST and ACK packets
3271                  * sent in SYN-RECV and TIME-WAIT state.
3272                  */
3273                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3274
3275                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3276         }
3277         if (register_pernet_subsys(&tcp_sk_ops))
3278                 panic("Failed to create the TCP control socket.\n");
3279
3280 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3281         bpf_iter_register();
3282 #endif
3283 }