/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt
#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
		      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
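/* Editorial note on the "+ 65535 + 2" above: the intent is to start the
 * reused connection's write_seq one maximum window (plus a little slack)
 * past the old incarnation's last sequence number, so that even a peer
 * that does not use timestamps cannot mistake segments of the new
 * connection for in-window segments of the old one.
 */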
143 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
146 /* This check is replicated from tcp_v4_connect() and intended to
147 * prevent BPF program called below from accessing bytes that are out
148 * of the bound specified by user in addr_len.
150 if (addr_len < sizeof(struct sockaddr_in))
153 sock_owned_by_me(sk);
155 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
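/* Editorial note: a BPF program attached at BPF_CGROUP_INET4_CONNECT may
 * inspect and rewrite @uaddr here (e.g. to transparently redirect the
 * connection), which is why addr_len must be validated against
 * sizeof(struct sockaddr_in) before the program is allowed to run.
 */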
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
						       inet->inet_daddr,
						       inet->inet_sport,
						       usin->sin_port);
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = tp->write_seq ^ jiffies;

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
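/* For reference, the path above is what a plain userspace connect(2) on a
 * TCP socket ends up in. A minimal, illustrative (untested here) caller:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sin = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * A successful connect() means the three-way handshake completed; the
 * SYN-SENT transition and hash-table insertion seen above all happen
 * inside tcp_v4_connect() while the socket lock is held.
 */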
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
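/* Worked example (illustrative numbers): if icsk_pmtu_cookie was 1500 and
 * the ICMP_FRAG_NEEDED report carries mtu = 1400, tcp_sync_mss() drops the
 * cached MSS to roughly 1400 minus IP and TCP header overhead (40 bytes
 * without options, i.e. about 1360), and queued data is resegmented and
 * retransmitted immediately instead of waiting for the RTO timer.
 */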
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_rtx_queue_head(sk);
		BUG_ON(!skb);

		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
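/* With checksum offload, th->check above holds only the inverted
 * one's-complement sum of the IPv4 pseudo-header (saddr, daddr, protocol,
 * TCP length); the NIC (or the software fallback) later folds the TCP
 * header and payload into it starting at csum_start and stores the final
 * checksum at csum_offset.
 */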
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * We do not loosen security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb),
					     tcp_v4_sdif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq_opt_deref(ireq));
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;

		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
		   gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 prefixlen = 32;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET, prefixlen);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
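/* Userspace side, for illustration: an RFC 2385 key for a BGP-style peer
 * is installed with setsockopt(TCP_MD5SIG) before connect() or listen().
 * A minimal, illustrative (untested here) sketch:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.2", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * With TCP_MD5SIG_EXT plus TCP_MD5SIG_FLAG_PREFIX set, tcpm_prefixlen lets
 * a single key cover a whole subnet, which is what the prefixlen handling
 * above implements.
 */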
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
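/* Per RFC 2385 the digest covers, in order: the IPv4 pseudo-header, the
 * TCP header with its checksum field zeroed (tcp_v4_md5_hash_headers()
 * above), the segment payload, and finally the key itself - which is why
 * the three crypto_ahash_update() stages above appear in exactly that
 * order.
 */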
#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
	return false;
#else
	return false;
#endif
}
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb));
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few sockets' backlogs are likely to be concurrently
	 * non-empty.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;
	unsigned int eaten = skb->len;
	int err;

	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
	if (!err) {
		eaten -= skb->len;
		TCP_SKB_CB(skb)->end_seq -= eaten;
	}
	return err;
}
EXPORT_SYMBOL(tcp_filter);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	int sdif = inet_sdif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		/* fall through */
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket following cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == afinfo->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != afinfo->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == afinfo->family &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
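/* Illustrative /proc/net/tcp entry (hypothetical values) as produced by
 * the show routine above:
 *
 *   0: 0100007F:1F90 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000 0 12345 1 ...
 *
 * i.e. a socket bound to 127.0.0.1:8080 (addresses and ports printed in
 * hex), in state 0A (TCP_LISTEN), owned by uid 1000, inode 12345.
 */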
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	module_put(net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 0;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of four TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
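/* Each sysctl_tcp_* default above is a per-netns knob exposed under
 * /proc/sys/net/ipv4/, so a new network namespace starts from these
 * values and an administrator can override them, e.g.:
 *
 *	sysctl net.ipv4.tcp_syncookies=1
 *	sysctl net.ipv4.tcp_tw_reuse=1
 *
 * (both names correspond to fields initialized in tcp_sk_init() above).
 */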
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}