net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53 #define pr_fmt(fmt) "TCP: " fmt
  54
  55 #include <linux/bottom_half.h>
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64 #include <linux/slab.h>
  65
  66 #include <net/net_namespace.h>
  67 #include <net/icmp.h>
  68 #include <net/inet_hashtables.h>
  69 #include <net/tcp.h>
  70 #include <net/transp_v6.h>
  71 #include <net/ipv6.h>
  72 #include <net/inet_common.h>
  73 #include <net/timewait_sock.h>
  74 #include <net/xfrm.h>
  75 #include <net/secure_seq.h>
  76 #include <net/busy_poll.h>
  77
  78 #include <linux/inet.h>
  79 #include <linux/ipv6.h>
  80 #include <linux/stddef.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/seq_file.h>
  83
  84 #include <crypto/hash.h>
  85 #include <linux/scatterlist.h>
  86
  87 int sysctl_tcp_low_latency __read_mostly;
  88
  89 #ifdef CONFIG_TCP_MD5SIG
  90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  91                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  92 #endif
  93
  94 struct inet_hashinfo tcp_hashinfo;
  95 EXPORT_SYMBOL(tcp_hashinfo);
  96
  97 static u32 tcp_v4_init_seq_and_tsoff(const struct sk_buff *skb, u32 *tsoff)
  98 {
  99         return secure_tcp_seq_and_tsoff(ip_hdr(skb)->daddr,
 100                                         ip_hdr(skb)->saddr,
 101                                         tcp_hdr(skb)->dest,
 102                                         tcp_hdr(skb)->source, tsoff);
 103 }
 104
 105 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 106 {
 107         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 108         struct tcp_sock *tp = tcp_sk(sk);
 109
 110         /* With PAWS, it is safe from the viewpoint
 111            of data integrity. Even without PAWS it is safe provided sequence
 112            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 113
 114            Actually, the idea is close to VJ's one, only timestamp cache is
 115            held not per host, but per port pair and TW bucket is used as state
 116            holder.
 117
 118            If TW bucket has been already destroyed we fall back to VJ's scheme
 119            and use initial timestamp retrieved from peer table.
 120          */
 121         if (tcptw->tw_ts_recent_stamp &&
 122             (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
 123                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 124                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 125                 if (tp->write_seq == 0)
 126                         tp->write_seq = 1;
 127                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 128                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 129                 sock_hold(sktw);
 130                 return 1;
 131         }
 132
 133         return 0;
 134 }
 135 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 136
 137 /* This will initiate an outgoing connection. */
 138 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 139 {
 140         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 141         struct inet_sock *inet = inet_sk(sk);
 142         struct tcp_sock *tp = tcp_sk(sk);
 143         __be16 orig_sport, orig_dport;
 144         __be32 daddr, nexthop;
 145         struct flowi4 *fl4;
 146         struct rtable *rt;
 147         int err;
 148         u32 seq;
 149         struct ip_options_rcu *inet_opt;
 150         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 151
 152         if (addr_len < sizeof(struct sockaddr_in))
 153                 return -EINVAL;
 154
 155         if (usin->sin_family != AF_INET)
 156                 return -EAFNOSUPPORT;
 157
 158         nexthop = daddr = usin->sin_addr.s_addr;
 159         inet_opt = rcu_dereference_protected(inet->inet_opt,
 160                                              lockdep_sock_is_held(sk));
 161         if (inet_opt && inet_opt->opt.srr) {
 162                 if (!daddr)
 163                         return -EINVAL;
 164                 nexthop = inet_opt->opt.faddr;
 165         }
 166
 167         orig_sport = inet->inet_sport;
 168         orig_dport = usin->sin_port;
 169         fl4 = &inet->cork.fl.u.ip4;
 170         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 171                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 172                               IPPROTO_TCP,
 173                               orig_sport, orig_dport, sk);
 174         if (IS_ERR(rt)) {
 175                 err = PTR_ERR(rt);
 176                 if (err == -ENETUNREACH)
 177                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 178                 return err;
 179         }
 180
 181         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 182                 ip_rt_put(rt);
 183                 return -ENETUNREACH;
 184         }
 185
 186         if (!inet_opt || !inet_opt->opt.srr)
 187                 daddr = fl4->daddr;
 188
 189         if (!inet->inet_saddr)
 190                 inet->inet_saddr = fl4->saddr;
 191         sk_rcv_saddr_set(sk, inet->inet_saddr);
 192
 193         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 194                 /* Reset inherited state */
 195                 tp->rx_opt.ts_recent       = 0;
 196                 tp->rx_opt.ts_recent_stamp = 0;
 197                 if (likely(!tp->repair))
 198                         tp->write_seq      = 0;
 199         }
 200
 201         inet->inet_dport = usin->sin_port;
 202         sk_daddr_set(sk, daddr);
 203
 204         inet_csk(sk)->icsk_ext_hdr_len = 0;
 205         if (inet_opt)
 206                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 207
 208         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 209
 210         /* Socket identity is still unknown (sport may be zero).
 211          * However we set state to SYN-SENT and not releasing socket
 212          * lock select source port, enter ourselves into the hash tables and
 213          * complete initialization after this.
 214          */
 215         tcp_set_state(sk, TCP_SYN_SENT);
 216         err = inet_hash_connect(tcp_death_row, sk);
 217         if (err)
 218                 goto failure;
 219
 220         sk_set_txhash(sk);
 221
 222         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 223                                inet->inet_sport, inet->inet_dport, sk);
 224         if (IS_ERR(rt)) {
 225                 err = PTR_ERR(rt);
 226                 rt = NULL;
 227                 goto failure;
 228         }
 229         /* OK, now commit destination to socket.  */
 230         sk->sk_gso_type = SKB_GSO_TCPV4;
 231         sk_setup_caps(sk, &rt->dst);
 232         rt = NULL;
 233
 234         if (likely(!tp->repair)) {
 235                 seq = secure_tcp_seq_and_tsoff(inet->inet_saddr,
 236                                                inet->inet_daddr,
 237                                                inet->inet_sport,
 238                                                usin->sin_port,
 239                                                &tp->tsoffset);
 240                 if (!tp->write_seq)
 241                         tp->write_seq = seq;
 242         }
 243
 244         inet->inet_id = tp->write_seq ^ jiffies;
 245
 246         if (tcp_fastopen_defer_connect(sk, &err))
 247                 return err;
 248         if (err)
 249                 goto failure;
 250
 251         err = tcp_connect(sk);
 252
 253         if (err)
 254                 goto failure;
 255
 256         return 0;
 257
 258 failure:
 259         /*
 260          * This unhashes the socket and releases the local port,
 261          * if necessary.
 262          */
 263         tcp_set_state(sk, TCP_CLOSE);
 264         ip_rt_put(rt);
 265         sk->sk_route_caps = 0;
 266         inet->inet_dport = 0;
 267         return err;
 268 }
 269 EXPORT_SYMBOL(tcp_v4_connect);
 270
 271 /*
 272  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 273  * It can be called through tcp_release_cb() if socket was owned by user
 274  * at the time tcp_v4_err() was called to handle ICMP message.
 275  */
 276 void tcp_v4_mtu_reduced(struct sock *sk)
 277 {
 278         struct inet_sock *inet = inet_sk(sk);
 279         struct dst_entry *dst;
 280         u32 mtu;
 281
 282         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 283                 return;
 284         mtu = tcp_sk(sk)->mtu_info;
 285         dst = inet_csk_update_pmtu(sk, mtu);
 286         if (!dst)
 287                 return;
 288
 289         /* Something is about to be wrong... Remember soft error
 290          * for the case, if this connection will not able to recover.
 291          */
 292         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 293                 sk->sk_err_soft = EMSGSIZE;
 294
 295         mtu = dst_mtu(dst);
 296
 297         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 298             ip_sk_accept_pmtu(sk) &&
 299             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 300                 tcp_sync_mss(sk, mtu);
 301
 302                 /* Resend the TCP packet because it's
 303                  * clear that the old packet has been
 304                  * dropped. This is the new "fast" path mtu
 305                  * discovery.
 306                  */
 307                 tcp_simple_retransmit(sk);
 308         } /* else let the usual retransmit timer handle it */
 309 }
 310 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 311
 312 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 313 {
 314         struct dst_entry *dst = __sk_dst_check(sk, 0);
 315
 316         if (dst)
 317                 dst->ops->redirect(dst, sk, skb);
 318 }
 319
 320
 321 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 322 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 323 {
 324         struct request_sock *req = inet_reqsk(sk);
 325         struct net *net = sock_net(sk);
 326
 327         /* ICMPs are not backlogged, hence we cannot get
 328          * an established socket here.
 329          */
 330         if (seq != tcp_rsk(req)->snt_isn) {
 331                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 332         } else if (abort) {
 333                 /*
 334                  * Still in SYN_RECV, just remove it silently.
 335                  * There is no good way to pass the error to the newly
 336                  * created socket, and POSIX does not want network
 337                  * errors returned from accept().
 338                  */
 339                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 340                 tcp_listendrop(req->rsk_listener);
 341         }
 342         reqsk_put(req);
 343 }
 344 EXPORT_SYMBOL(tcp_req_err);
 345
 346 /*
 347  * This routine is called by the ICMP module when it gets some
 348  * sort of error condition.  If err < 0 then the socket should
 349  * be closed and the error returned to the user.  If err > 0
 350  * it's just the icmp type << 8 | icmp code.  After adjustment
 351  * header points to the first 8 bytes of the tcp header.  We need
 352  * to find the appropriate port.
 353  *
 354  * The locking strategy used here is very "optimistic". When
 355  * someone else accesses the socket the ICMP is just dropped
 356  * and for some paths there is no check at all.
 357  * A more general error queue to queue errors for later handling
 358  * is probably better.
 359  *
 360  */
 361
 362 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 363 {
 364         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 365         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 366         struct inet_connection_sock *icsk;
 367         struct tcp_sock *tp;
 368         struct inet_sock *inet;
 369         const int type = icmp_hdr(icmp_skb)->type;
 370         const int code = icmp_hdr(icmp_skb)->code;
 371         struct sock *sk;
 372         struct sk_buff *skb;
 373         struct request_sock *fastopen;
 374         __u32 seq, snd_una;
 375         __u32 remaining;
 376         int err;
 377         struct net *net = dev_net(icmp_skb->dev);
 378
 379         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 380                                        th->dest, iph->saddr, ntohs(th->source),
 381                                        inet_iif(icmp_skb));
 382         if (!sk) {
 383                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 384                 return;
 385         }
 386         if (sk->sk_state == TCP_TIME_WAIT) {
 387                 inet_twsk_put(inet_twsk(sk));
 388                 return;
 389         }
 390         seq = ntohl(th->seq);
 391         if (sk->sk_state == TCP_NEW_SYN_RECV)
 392                 return tcp_req_err(sk, seq,
 393                                   type == ICMP_PARAMETERPROB ||
 394                                   type == ICMP_TIME_EXCEEDED ||
 395                                   (type == ICMP_DEST_UNREACH &&
 396                                    (code == ICMP_NET_UNREACH ||
 397                                     code == ICMP_HOST_UNREACH)));
 398
 399         bh_lock_sock(sk);
 400         /* If too many ICMPs get dropped on busy
 401          * servers this needs to be solved differently.
 402          * We do take care of PMTU discovery (RFC1191) special case :
 403          * we can receive locally generated ICMP messages while socket is held.
 404          */
 405         if (sock_owned_by_user(sk)) {
 406                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 407                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 408         }
 409         if (sk->sk_state == TCP_CLOSE)
 410                 goto out;
 411
 412         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 413                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 414                 goto out;
 415         }
 416
 417         icsk = inet_csk(sk);
 418         tp = tcp_sk(sk);
 419         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 420         fastopen = tp->fastopen_rsk;
 421         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 422         if (sk->sk_state != TCP_LISTEN &&
 423             !between(seq, snd_una, tp->snd_nxt)) {
 424                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 425                 goto out;
 426         }
 427
 428         switch (type) {
 429         case ICMP_REDIRECT:
 430                 if (!sock_owned_by_user(sk))
 431                         do_redirect(icmp_skb, sk);
 432                 goto out;
 433         case ICMP_SOURCE_QUENCH:
 434                 /* Just silently ignore these. */
 435                 goto out;
 436         case ICMP_PARAMETERPROB:
 437                 err = EPROTO;
 438                 break;
 439         case ICMP_DEST_UNREACH:
 440                 if (code > NR_ICMP_UNREACH)
 441                         goto out;
 442
 443                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 444                         /* We are not interested in TCP_LISTEN and open_requests
 445                          * (SYN-ACKs send out by Linux are always <576bytes so
 446                          * they should go through unfragmented).
 447                          */
 448                         if (sk->sk_state == TCP_LISTEN)
 449                                 goto out;
 450
 451                         tp->mtu_info = info;
 452                         if (!sock_owned_by_user(sk)) {
 453                                 tcp_v4_mtu_reduced(sk);
 454                         } else {
 455                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 456                                         sock_hold(sk);
 457                         }
 458                         goto out;
 459                 }
 460
 461                 err = icmp_err_convert[code].errno;
 462                 /* check if icmp_skb allows revert of backoff
 463                  * (see draft-zimmermann-tcp-lcd) */
 464                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 465                         break;
 466                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 467                     !icsk->icsk_backoff || fastopen)
 468                         break;
 469
 470                 if (sock_owned_by_user(sk))
 471                         break;
 472
 473                 icsk->icsk_backoff--;
 474                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 475                                                TCP_TIMEOUT_INIT;
 476                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 477
 478                 skb = tcp_write_queue_head(sk);
 479                 BUG_ON(!skb);
 480
 481                 remaining = icsk->icsk_rto -
 482                             min(icsk->icsk_rto,
 483                                 tcp_time_stamp - tcp_skb_timestamp(skb));
 484
 485                 if (remaining) {
 486                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 487                                                   remaining, TCP_RTO_MAX);
 488                 } else {
 489                         /* RTO revert clocked out retransmission.
 490                          * Will retransmit now */
 491                         tcp_retransmit_timer(sk);
 492                 }
 493
 494                 break;
 495         case ICMP_TIME_EXCEEDED:
 496                 err = EHOSTUNREACH;
 497                 break;
 498         default:
 499                 goto out;
 500         }
 501
 502         switch (sk->sk_state) {
 503         case TCP_SYN_SENT:
 504         case TCP_SYN_RECV:
 505                 /* Only in fast or simultaneous open. If a fast open socket is
 506                  * is already accepted it is treated as a connected one below.
 507                  */
 508                 if (fastopen && !fastopen->sk)
 509                         break;
 510
 511                 if (!sock_owned_by_user(sk)) {
 512                         sk->sk_err = err;
 513
 514                         sk->sk_error_report(sk);
 515
 516                         tcp_done(sk);
 517                 } else {
 518                         sk->sk_err_soft = err;
 519                 }
 520                 goto out;
 521         }
 522
 523         /* If we've already connected we will keep trying
 524          * until we time out, or the user gives up.
 525          *
 526          * rfc1122 4.2.3.9 allows to consider as hard errors
 527          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 528          * but it is obsoleted by pmtu discovery).
 529          *
 530          * Note, that in modern internet, where routing is unreliable
 531          * and in each dark corner broken firewalls sit, sending random
 532          * errors ordered by their masters even this two messages finally lose
 533          * their original sense (even Linux sends invalid PORT_UNREACHs)
 534          *
 535          * Now we are in compliance with RFCs.
 536          *                                                      --ANK (980905)
 537          */
 538
 539         inet = inet_sk(sk);
 540         if (!sock_owned_by_user(sk) && inet->recverr) {
 541                 sk->sk_err = err;
 542                 sk->sk_error_report(sk);
 543         } else  { /* Only an error on timeout */
 544                 sk->sk_err_soft = err;
 545         }
 546
 547 out:
 548         bh_unlock_sock(sk);
 549         sock_put(sk);
 550 }
 551
 552 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 553 {
 554         struct tcphdr *th = tcp_hdr(skb);
 555
 556         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 557                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 558                 skb->csum_start = skb_transport_header(skb) - skb->head;
 559                 skb->csum_offset = offsetof(struct tcphdr, check);
 560         } else {
 561                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 562                                          csum_partial(th,
 563                                                       th->doff << 2,
 564                                                       skb->csum));
 565         }
 566 }
 567
 568 /* This routine computes an IPv4 TCP checksum. */
 569 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 570 {
 571         const struct inet_sock *inet = inet_sk(sk);
 572
 573         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 574 }
 575 EXPORT_SYMBOL(tcp_v4_send_check);
 576
 577 /*
 578  *      This routine will send an RST to the other tcp.
 579  *
 580  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 581  *                    for reset.
 582  *      Answer: if a packet caused RST, it is not for a socket
 583  *              existing in our system, if it is matched to a socket,
 584  *              it is just duplicate segment or bug in other side's TCP.
 585  *              So that we build reply only basing on parameters
 586  *              arrived with segment.
 587  *      Exception: precedence violation. We do not implement it in any case.
 588  */
 589
 590 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 591 {
 592         const struct tcphdr *th = tcp_hdr(skb);
 593         struct {
 594                 struct tcphdr th;
 595 #ifdef CONFIG_TCP_MD5SIG
 596                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 597 #endif
 598         } rep;
 599         struct ip_reply_arg arg;
 600 #ifdef CONFIG_TCP_MD5SIG
 601         struct tcp_md5sig_key *key = NULL;
 602         const __u8 *hash_location = NULL;
 603         unsigned char newhash[16];
 604         int genhash;
 605         struct sock *sk1 = NULL;
 606 #endif
 607         struct net *net;
 608
 609         /* Never send a reset in response to a reset. */
 610         if (th->rst)
 611                 return;
 612
 613         /* If sk not NULL, it means we did a successful lookup and incoming
 614          * route had to be correct. prequeue might have dropped our dst.
 615          */
 616         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 617                 return;
 618
 619         /* Swap the send and the receive. */
 620         memset(&rep, 0, sizeof(rep));
 621         rep.th.dest   = th->source;
 622         rep.th.source = th->dest;
 623         rep.th.doff   = sizeof(struct tcphdr) / 4;
 624         rep.th.rst    = 1;
 625
 626         if (th->ack) {
 627                 rep.th.seq = th->ack_seq;
 628         } else {
 629                 rep.th.ack = 1;
 630                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 631                                        skb->len - (th->doff << 2));
 632         }
 633
 634         memset(&arg, 0, sizeof(arg));
 635         arg.iov[0].iov_base = (unsigned char *)&rep;
 636         arg.iov[0].iov_len  = sizeof(rep.th);
 637
 638         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 639 #ifdef CONFIG_TCP_MD5SIG
 640         rcu_read_lock();
 641         hash_location = tcp_parse_md5sig_option(th);
 642         if (sk && sk_fullsock(sk)) {
 643                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 644                                         &ip_hdr(skb)->saddr, AF_INET);
 645         } else if (hash_location) {
 646                 /*
 647                  * active side is lost. Try to find listening socket through
 648                  * source port, and then find md5 key through listening socket.
 649                  * we are not loose security here:
 650                  * Incoming packet is checked with md5 hash with finding key,
 651                  * no RST generated if md5 hash doesn't match.
 652                  */
 653                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 654                                              ip_hdr(skb)->saddr,
 655                                              th->source, ip_hdr(skb)->daddr,
 656                                              ntohs(th->source), inet_iif(skb));
 657                 /* don't send rst if it can't find key */
 658                 if (!sk1)
 659                         goto out;
 660
 661                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 662                                         &ip_hdr(skb)->saddr, AF_INET);
 663                 if (!key)
 664                         goto out;
 665
 666
 667                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 668                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 669                         goto out;
 670
 671         }
 672
 673         if (key) {
 674                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 675                                    (TCPOPT_NOP << 16) |
 676                                    (TCPOPT_MD5SIG << 8) |
 677                                    TCPOLEN_MD5SIG);
 678                 /* Update length and the length the header thinks exists */
 679                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 680                 rep.th.doff = arg.iov[0].iov_len / 4;
 681
 682                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 683                                      key, ip_hdr(skb)->saddr,
 684                                      ip_hdr(skb)->daddr, &rep.th);
 685         }
 686 #endif
 687         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 688                                       ip_hdr(skb)->saddr, /* XXX */
 689                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 690         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 691         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 692
 693         /* When socket is gone, all binding information is lost.
 694          * routing might fail in this case. No choice here, if we choose to force
 695          * input interface, we will misroute in case of asymmetric route.
 696          */
 697         if (sk)
 698                 arg.bound_dev_if = sk->sk_bound_dev_if;
 699
 700         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 701                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 702
 703         arg.tos = ip_hdr(skb)->tos;
 704         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 705         local_bh_disable();
 706         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 707                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 708                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 709                               &arg, arg.iov[0].iov_len);
 710
 711         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 712         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 713         local_bh_enable();
 714
 715 #ifdef CONFIG_TCP_MD5SIG
 716 out:
 717         rcu_read_unlock();
 718 #endif
 719 }
 720
 721 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 722    outside socket context is ugly, certainly. What can I do?
 723  */
 724
 725 static void tcp_v4_send_ack(const struct sock *sk,
 726                             struct sk_buff *skb, u32 seq, u32 ack,
 727                             u32 win, u32 tsval, u32 tsecr, int oif,
 728                             struct tcp_md5sig_key *key,
 729                             int reply_flags, u8 tos)
 730 {
 731         const struct tcphdr *th = tcp_hdr(skb);
 732         struct {
 733                 struct tcphdr th;
 734                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 735 #ifdef CONFIG_TCP_MD5SIG
 736                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 737 #endif
 738                         ];
 739         } rep;
 740         struct net *net = sock_net(sk);
 741         struct ip_reply_arg arg;
 742
 743         memset(&rep.th, 0, sizeof(struct tcphdr));
 744         memset(&arg, 0, sizeof(arg));
 745
 746         arg.iov[0].iov_base = (unsigned char *)&rep;
 747         arg.iov[0].iov_len  = sizeof(rep.th);
 748         if (tsecr) {
 749                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 750                                    (TCPOPT_TIMESTAMP << 8) |
 751                                    TCPOLEN_TIMESTAMP);
 752                 rep.opt[1] = htonl(tsval);
 753                 rep.opt[2] = htonl(tsecr);
 754                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 755         }
 756
 757         /* Swap the send and the receive. */
 758         rep.th.dest    = th->source;
 759         rep.th.source  = th->dest;
 760         rep.th.doff    = arg.iov[0].iov_len / 4;
 761         rep.th.seq     = htonl(seq);
 762         rep.th.ack_seq = htonl(ack);
 763         rep.th.ack     = 1;
 764         rep.th.window  = htons(win);
 765
 766 #ifdef CONFIG_TCP_MD5SIG
 767         if (key) {
 768                 int offset = (tsecr) ? 3 : 0;
 769
 770                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 771                                           (TCPOPT_NOP << 16) |
 772                                           (TCPOPT_MD5SIG << 8) |
 773                                           TCPOLEN_MD5SIG);
 774                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 775                 rep.th.doff = arg.iov[0].iov_len/4;
 776
 777                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 778                                     key, ip_hdr(skb)->saddr,
 779                                     ip_hdr(skb)->daddr, &rep.th);
 780         }
 781 #endif
 782         arg.flags = reply_flags;
 783         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 784                                       ip_hdr(skb)->saddr, /* XXX */
 785                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 786         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 787         if (oif)
 788                 arg.bound_dev_if = oif;
 789         arg.tos = tos;
 790         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 791         local_bh_disable();
 792         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 793                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 794                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 795                               &arg, arg.iov[0].iov_len);
 796
 797         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 798         local_bh_enable();
 799 }
 800
 801 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 802 {
 803         struct inet_timewait_sock *tw = inet_twsk(sk);
 804         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 805
 806         tcp_v4_send_ack(sk, skb,
 807                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 808                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 809                         tcp_time_stamp + tcptw->tw_ts_offset,
 810                         tcptw->tw_ts_recent,
 811                         tw->tw_bound_dev_if,
 812                         tcp_twsk_md5_key(tcptw),
 813                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 814                         tw->tw_tos
 815                         );
 816
 817         inet_twsk_put(tw);
 818 }
 819
 820 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 821                                   struct request_sock *req)
 822 {
 823         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 824          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 825          */
 826         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 827                                              tcp_sk(sk)->snd_nxt;
 828
 829         /* RFC 7323 2.3
 830          * The window field (SEG.WND) of every outgoing segment, with the
 831          * exception of <SYN> segments, MUST be right-shifted by
 832          * Rcv.Wind.Shift bits:
 833          */
 834         tcp_v4_send_ack(sk, skb, seq,
 835                         tcp_rsk(req)->rcv_nxt,
 836                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 837                         tcp_time_stamp + tcp_rsk(req)->ts_off,
 838                         req->ts_recent,
 839                         0,
 840                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 841                                           AF_INET),
 842                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 843                         ip_hdr(skb)->tos);
 844 }
 845
 846 /*
 847  *      Send a SYN-ACK after having received a SYN.
 848  *      This still operates on a request_sock only, not on a big
 849  *      socket.
 850  */
 851 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 852                               struct flowi *fl,
 853                               struct request_sock *req,
 854                               struct tcp_fastopen_cookie *foc,
 855                               enum tcp_synack_type synack_type)
 856 {
 857         const struct inet_request_sock *ireq = inet_rsk(req);
 858         struct flowi4 fl4;
 859         int err = -1;
 860         struct sk_buff *skb;
 861
 862         /* First, grab a route. */
 863         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 864                 return -1;
 865
 866         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 867
 868         if (skb) {
 869                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 870
 871                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 872                                             ireq->ir_rmt_addr,
 873                                             ireq->opt);
 874                 err = net_xmit_eval(err);
 875         }
 876
 877         return err;
 878 }
 879
 880 /*
 881  *      IPv4 request_sock destructor.
 882  */
 883 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 884 {
 885         kfree(inet_rsk(req)->opt);
 886 }
 887
 888 #ifdef CONFIG_TCP_MD5SIG
 889 /*
 890  * RFC2385 MD5 checksumming requires a mapping of
 891  * IP address->MD5 Key.
 892  * We need to maintain these in the sk structure.
 893  */
 894
 895 /* Find the Key structure for an address.  */
 896 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 897                                          const union tcp_md5_addr *addr,
 898                                          int family)
 899 {
 900         const struct tcp_sock *tp = tcp_sk(sk);
 901         struct tcp_md5sig_key *key;
 902         unsigned int size = sizeof(struct in_addr);
 903         const struct tcp_md5sig_info *md5sig;
 904
 905         /* caller either holds rcu_read_lock() or socket lock */
 906         md5sig = rcu_dereference_check(tp->md5sig_info,
 907                                        lockdep_sock_is_held(sk));
 908         if (!md5sig)
 909                 return NULL;
 910 #if IS_ENABLED(CONFIG_IPV6)
 911         if (family == AF_INET6)
 912                 size = sizeof(struct in6_addr);
 913 #endif
 914         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 915                 if (key->family != family)
 916                         continue;
 917                 if (!memcmp(&key->addr, addr, size))
 918                         return key;
 919         }
 920         return NULL;
 921 }
 922 EXPORT_SYMBOL(tcp_md5_do_lookup);
 923
 924 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 925                                          const struct sock *addr_sk)
 926 {
 927         const union tcp_md5_addr *addr;
 928
 929         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
 930         return tcp_md5_do_lookup(sk, addr, AF_INET);
 931 }
 932 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 933
 934 /* This can be called on a newly created socket, from other files */
 935 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 936                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
 937 {
 938         /* Add Key to the list */
 939         struct tcp_md5sig_key *key;
 940         struct tcp_sock *tp = tcp_sk(sk);
 941         struct tcp_md5sig_info *md5sig;
 942
 943         key = tcp_md5_do_lookup(sk, addr, family);
 944         if (key) {
 945                 /* Pre-existing entry - just update that one. */
 946                 memcpy(key->key, newkey, newkeylen);
 947                 key->keylen = newkeylen;
 948                 return 0;
 949         }
 950
 951         md5sig = rcu_dereference_protected(tp->md5sig_info,
 952                                            lockdep_sock_is_held(sk));
 953         if (!md5sig) {
 954                 md5sig = kmalloc(sizeof(*md5sig), gfp);
 955                 if (!md5sig)
 956                         return -ENOMEM;
 957
 958                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 959                 INIT_HLIST_HEAD(&md5sig->head);
 960                 rcu_assign_pointer(tp->md5sig_info, md5sig);
 961         }
 962
 963         key = sock_kmalloc(sk, sizeof(*key), gfp);
 964         if (!key)
 965                 return -ENOMEM;
 966         if (!tcp_alloc_md5sig_pool()) {
 967                 sock_kfree_s(sk, key, sizeof(*key));
 968                 return -ENOMEM;
 969         }
 970
 971         memcpy(key->key, newkey, newkeylen);
 972         key->keylen = newkeylen;
 973         key->family = family;
 974         memcpy(&key->addr, addr,
 975                (family == AF_INET6) ? sizeof(struct in6_addr) :
 976                                       sizeof(struct in_addr));
 977         hlist_add_head_rcu(&key->node, &md5sig->head);
 978         return 0;
 979 }
 980 EXPORT_SYMBOL(tcp_md5_do_add);
 981
 982 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
 983 {
 984         struct tcp_md5sig_key *key;
 985
 986         key = tcp_md5_do_lookup(sk, addr, family);
 987         if (!key)
 988                 return -ENOENT;
 989         hlist_del_rcu(&key->node);
 990         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
 991         kfree_rcu(key, rcu);
 992         return 0;
 993 }
 994 EXPORT_SYMBOL(tcp_md5_do_del);
 995
 996 static void tcp_clear_md5_list(struct sock *sk)
 997 {
 998         struct tcp_sock *tp = tcp_sk(sk);
 999         struct tcp_md5sig_key *key;
1000         struct hlist_node *n;
1001         struct tcp_md5sig_info *md5sig;
1002
1003         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1004
1005         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1006                 hlist_del_rcu(&key->node);
1007                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1008                 kfree_rcu(key, rcu);
1009         }
1010 }
1011
1012 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1013                                  int optlen)
1014 {
1015         struct tcp_md5sig cmd;
1016         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1017
1018         if (optlen < sizeof(cmd))
1019                 return -EINVAL;
1020
1021         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1022                 return -EFAULT;
1023
1024         if (sin->sin_family != AF_INET)
1025                 return -EINVAL;
1026
1027         if (!cmd.tcpm_keylen)
1028                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1029                                       AF_INET);
1030
1031         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1032                 return -EINVAL;
1033
1034         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1035                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1036                               GFP_KERNEL);
1037 }
1038
1039 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1040                                    __be32 daddr, __be32 saddr,
1041                                    const struct tcphdr *th, int nbytes)
1042 {
1043         struct tcp4_pseudohdr *bp;
1044         struct scatterlist sg;
1045         struct tcphdr *_th;
1046
1047         bp = hp->scratch;
1048         bp->saddr = saddr;
1049         bp->daddr = daddr;
1050         bp->pad = 0;
1051         bp->protocol = IPPROTO_TCP;
1052         bp->len = cpu_to_be16(nbytes);
1053
1054         _th = (struct tcphdr *)(bp + 1);
1055         memcpy(_th, th, sizeof(*th));
1056         _th->check = 0;
1057
1058         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1059         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1060                                 sizeof(*bp) + sizeof(*th));
1061         return crypto_ahash_update(hp->md5_req);
1062 }
1063
1064 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1065                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1066 {
1067         struct tcp_md5sig_pool *hp;
1068         struct ahash_request *req;
1069
1070         hp = tcp_get_md5sig_pool();
1071         if (!hp)
1072                 goto clear_hash_noput;
1073         req = hp->md5_req;
1074
1075         if (crypto_ahash_init(req))
1076                 goto clear_hash;
1077         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1078                 goto clear_hash;
1079         if (tcp_md5_hash_key(hp, key))
1080                 goto clear_hash;
1081         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1082         if (crypto_ahash_final(req))
1083                 goto clear_hash;
1084
1085         tcp_put_md5sig_pool();
1086         return 0;
1087
1088 clear_hash:
1089         tcp_put_md5sig_pool();
1090 clear_hash_noput:
1091         memset(md5_hash, 0, 16);
1092         return 1;
1093 }
1094
1095 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1096                         const struct sock *sk,
1097                         const struct sk_buff *skb)
1098 {
1099         struct tcp_md5sig_pool *hp;
1100         struct ahash_request *req;
1101         const struct tcphdr *th = tcp_hdr(skb);
1102         __be32 saddr, daddr;
1103
1104         if (sk) { /* valid for establish/request sockets */
1105                 saddr = sk->sk_rcv_saddr;
1106                 daddr = sk->sk_daddr;
1107         } else {
1108                 const struct iphdr *iph = ip_hdr(skb);
1109                 saddr = iph->saddr;
1110                 daddr = iph->daddr;
1111         }
1112
1113         hp = tcp_get_md5sig_pool();
1114         if (!hp)
1115                 goto clear_hash_noput;
1116         req = hp->md5_req;
1117
1118         if (crypto_ahash_init(req))
1119                 goto clear_hash;
1120
1121         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1122                 goto clear_hash;
1123         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1124                 goto clear_hash;
1125         if (tcp_md5_hash_key(hp, key))
1126                 goto clear_hash;
1127         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1128         if (crypto_ahash_final(req))
1129                 goto clear_hash;
1130
1131         tcp_put_md5sig_pool();
1132         return 0;
1133
1134 clear_hash:
1135         tcp_put_md5sig_pool();
1136 clear_hash_noput:
1137         memset(md5_hash, 0, 16);
1138         return 1;
1139 }
1140 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1141
1142 #endif
1143
1144 /* Called with rcu_read_lock() */
1145 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1146                                     const struct sk_buff *skb)
1147 {
1148 #ifdef CONFIG_TCP_MD5SIG
1149         /*
1150          * This gets called for each TCP segment that arrives
1151          * so we want to be efficient.
1152          * We have 3 drop cases:
1153          * o No MD5 hash and one expected.
1154          * o MD5 hash and we're not expecting one.
1155          * o MD5 hash and its wrong.
1156          */
1157         const __u8 *hash_location = NULL;
1158         struct tcp_md5sig_key *hash_expected;
1159         const struct iphdr *iph = ip_hdr(skb);
1160         const struct tcphdr *th = tcp_hdr(skb);
1161         int genhash;
1162         unsigned char newhash[16];
1163
1164         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1165                                           AF_INET);
1166         hash_location = tcp_parse_md5sig_option(th);
1167
1168         /* We've parsed the options - do we have a hash? */
1169         if (!hash_expected && !hash_location)
1170                 return false;
1171
1172         if (hash_expected && !hash_location) {
1173                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1174                 return true;
1175         }
1176
1177         if (!hash_expected && hash_location) {
1178                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1179                 return true;
1180         }
1181
1182         /* Okay, so this is hash_expected and hash_location -
1183          * so we need to calculate the checksum.
1184          */
1185         genhash = tcp_v4_md5_hash_skb(newhash,
1186                                       hash_expected,
1187                                       NULL, skb);
1188
1189         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1190                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1191                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1192                                      &iph->saddr, ntohs(th->source),
1193                                      &iph->daddr, ntohs(th->dest),
1194                                      genhash ? " tcp_v4_calc_md5_hash failed"
1195                                      : "");
1196                 return true;
1197         }
1198         return false;
1199 #endif
1200         return false;
1201 }
1202
1203 static void tcp_v4_init_req(struct request_sock *req,
1204                             const struct sock *sk_listener,
1205                             struct sk_buff *skb)
1206 {
1207         struct inet_request_sock *ireq = inet_rsk(req);
1208
1209         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1210         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1211         ireq->opt = tcp_v4_save_options(skb);
1212 }
1213
1214 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1215                                           struct flowi *fl,
1216                                           const struct request_sock *req)
1217 {
1218         return inet_csk_route_req(sk, &fl->u.ip4, req);
1219 }
1220
1221 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1222         .family         =       PF_INET,
1223         .obj_size       =       sizeof(struct tcp_request_sock),
1224         .rtx_syn_ack    =       tcp_rtx_synack,
1225         .send_ack       =       tcp_v4_reqsk_send_ack,
1226         .destructor     =       tcp_v4_reqsk_destructor,
1227         .send_reset     =       tcp_v4_send_reset,
1228         .syn_ack_timeout =      tcp_syn_ack_timeout,
1229 };
1230
1231 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1232         .mss_clamp      =       TCP_MSS_DEFAULT,
1233 #ifdef CONFIG_TCP_MD5SIG
1234         .req_md5_lookup =       tcp_v4_md5_lookup,
1235         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1236 #endif
1237         .init_req       =       tcp_v4_init_req,
1238 #ifdef CONFIG_SYN_COOKIES
1239         .cookie_init_seq =      cookie_v4_init_sequence,
1240 #endif
1241         .route_req      =       tcp_v4_route_req,
1242         .init_seq_tsoff =       tcp_v4_init_seq_and_tsoff,
1243         .send_synack    =       tcp_v4_send_synack,
1244 };
1245
1246 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1247 {
1248         /* Never answer to SYNs send to broadcast or multicast */
1249         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1250                 goto drop;
1251
1252         return tcp_conn_request(&tcp_request_sock_ops,
1253                                 &tcp_request_sock_ipv4_ops, sk, skb);
1254
1255 drop:
1256         tcp_listendrop(sk);
1257         return 0;
1258 }
1259 EXPORT_SYMBOL(tcp_v4_conn_request);
1260
1261
1262 /*
1263  * The three way handshake has completed - we got a valid synack -
1264  * now create the new socket.
1265  */
1266 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1267                                   struct request_sock *req,
1268                                   struct dst_entry *dst,
1269                                   struct request_sock *req_unhash,
1270                                   bool *own_req)
1271 {
1272         struct inet_request_sock *ireq;
1273         struct inet_sock *newinet;
1274         struct tcp_sock *newtp;
1275         struct sock *newsk;
1276 #ifdef CONFIG_TCP_MD5SIG
1277         struct tcp_md5sig_key *key;
1278 #endif
1279         struct ip_options_rcu *inet_opt;
1280
1281         if (sk_acceptq_is_full(sk))
1282                 goto exit_overflow;
1283
1284         newsk = tcp_create_openreq_child(sk, req, skb);
1285         if (!newsk)
1286                 goto exit_nonewsk;
1287
1288         newsk->sk_gso_type = SKB_GSO_TCPV4;
1289         inet_sk_rx_dst_set(newsk, skb);
1290
1291         newtp                 = tcp_sk(newsk);
1292         newinet               = inet_sk(newsk);
1293         ireq                  = inet_rsk(req);
1294         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1295         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1296         newsk->sk_bound_dev_if = ireq->ir_iif;
1297         newinet->inet_saddr           = ireq->ir_loc_addr;
1298         inet_opt              = ireq->opt;
1299         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1300         ireq->opt             = NULL;
1301         newinet->mc_index     = inet_iif(skb);
1302         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1303         newinet->rcv_tos      = ip_hdr(skb)->tos;
1304         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1305         if (inet_opt)
1306                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1307         newinet->inet_id = newtp->write_seq ^ jiffies;
1308
1309         if (!dst) {
1310                 dst = inet_csk_route_child_sock(sk, newsk, req);
1311                 if (!dst)
1312                         goto put_and_exit;
1313         } else {
1314                 /* syncookie case : see end of cookie_v4_check() */
1315         }
1316         sk_setup_caps(newsk, dst);
1317
1318         tcp_ca_openreq_child(newsk, dst);
1319
1320         tcp_sync_mss(newsk, dst_mtu(dst));
1321         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1322
1323         tcp_initialize_rcv_mss(newsk);
1324
1325 #ifdef CONFIG_TCP_MD5SIG
1326         /* Copy over the MD5 key from the original socket */
1327         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1328                                 AF_INET);
1329         if (key) {
1330                 /*
1331                  * We're using one, so create a matching key
1332                  * on the newsk structure. If we fail to get
1333                  * memory, then we end up not copying the key
1334                  * across. Shucks.
1335                  */
1336                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1337                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1338                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1339         }
1340 #endif
1341
1342         if (__inet_inherit_port(sk, newsk) < 0)
1343                 goto put_and_exit;
1344         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1345         if (*own_req)
1346                 tcp_move_syn(newtp, req);
1347
1348         return newsk;
1349
1350 exit_overflow:
1351         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1352 exit_nonewsk:
1353         dst_release(dst);
1354 exit:
1355         tcp_listendrop(sk);
1356         return NULL;
1357 put_and_exit:
1358         inet_csk_prepare_forced_close(newsk);
1359         tcp_done(newsk);
1360         goto exit;
1361 }
1362 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1363
/*
 * With CONFIG_SYN_COOKIES, a segment without the SYN flag is passed to
 * cookie_v4_check() and its result is returned.  In every other case
 * @sk is returned unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
        if (!tcp_hdr(skb)->syn)
                return cookie_v4_check(sk, skb);
#endif
        return sk;
}
1374
1375 /* The socket must have it's spinlock held when we get
1376  * here, unless it is a TCP_LISTEN socket.
1377  *
1378  * We have a potential double-lock case here, so even when
1379  * doing backlog processing we use the BH locking scheme.
1380  * This is because we cannot sleep with the original spinlock
1381  * held.
1382  */
1383 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1384 {
1385         struct sock *rsk;
1386
1387         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1388                 struct dst_entry *dst = sk->sk_rx_dst;
1389
1390                 sock_rps_save_rxhash(sk, skb);
1391                 sk_mark_napi_id(sk, skb);
1392                 if (dst) {
1393                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1394                             !dst->ops->check(dst, 0)) {
1395                                 dst_release(dst);
1396                                 sk->sk_rx_dst = NULL;
1397                         }
1398                 }
1399                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1400                 return 0;
1401         }
1402
1403         if (tcp_checksum_complete(skb))
1404                 goto csum_err;
1405
1406         if (sk->sk_state == TCP_LISTEN) {
1407                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1408
1409                 if (!nsk)
1410                         goto discard;
1411                 if (nsk != sk) {
1412                         sock_rps_save_rxhash(nsk, skb);
1413                         sk_mark_napi_id(nsk, skb);
1414                         if (tcp_child_process(sk, nsk, skb)) {
1415                                 rsk = nsk;
1416                                 goto reset;
1417                         }
1418                         return 0;
1419                 }
1420         } else
1421                 sock_rps_save_rxhash(sk, skb);
1422
1423         if (tcp_rcv_state_process(sk, skb)) {
1424                 rsk = sk;
1425                 goto reset;
1426         }
1427         return 0;
1428
1429 reset:
1430         tcp_v4_send_reset(rsk, skb);
1431 discard:
1432         kfree_skb(skb);
1433         /* Be careful here. If this function gets more complicated and
1434          * gcc suffers from register pressure on the x86, sk (in %ebx)
1435          * might be destroyed here. This current version compiles correctly,
1436          * but you have been warned.
1437          */
1438         return 0;
1439
1440 csum_err:
1441         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1442         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1443         goto discard;
1444 }
1445 EXPORT_SYMBOL(tcp_v4_do_rcv);
1446
1447 void tcp_v4_early_demux(struct sk_buff *skb)
1448 {
1449         const struct iphdr *iph;
1450         const struct tcphdr *th;
1451         struct sock *sk;
1452
1453         if (skb->pkt_type != PACKET_HOST)
1454                 return;
1455
1456         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1457                 return;
1458
1459         iph = ip_hdr(skb);
1460         th = tcp_hdr(skb);
1461
1462         if (th->doff < sizeof(struct tcphdr) / 4)
1463                 return;
1464
1465         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1466                                        iph->saddr, th->source,
1467                                        iph->daddr, ntohs(th->dest),
1468                                        skb->skb_iif);
1469         if (sk) {
1470                 skb->sk = sk;
1471                 skb->destructor = sock_edemux;
1472                 if (sk_fullsock(sk)) {
1473                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1474
1475                         if (dst)
1476                                 dst = dst_check(dst, 0);
1477                         if (dst &&
1478                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1479                                 skb_dst_set_noref(skb, dst);
1480                 }
1481         }
1482 }
1483
1484 /* Packet is added to VJ-style prequeue for processing in process
1485  * context, if a reader task is waiting. Apparently, this exciting
1486  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1487  * failed somewhere. Latency? Burstiness? Well, at least now we will
1488  * see, why it failed. 8)8)                               --ANK
1489  *
1490  */
1491 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1492 {
1493         struct tcp_sock *tp = tcp_sk(sk);
1494
1495         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1496                 return false;
1497
1498         if (skb->len <= tcp_hdrlen(skb) &&
1499             skb_queue_len(&tp->ucopy.prequeue) == 0)
1500                 return false;
1501
1502         /* Before escaping RCU protected region, we need to take care of skb
1503          * dst. Prequeue is only enabled for established sockets.
1504          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1505          * Instead of doing full sk_rx_dst validity here, let's perform
1506          * an optimistic check.
1507          */
1508         if (likely(sk->sk_rx_dst))
1509                 skb_dst_drop(skb);
1510         else
1511                 skb_dst_force_safe(skb);
1512
1513         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1514         tp->ucopy.memory += skb->truesize;
1515         if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1516             tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1517                 struct sk_buff *skb1;
1518
1519                 BUG_ON(sock_owned_by_user(sk));
1520                 __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1521                                 skb_queue_len(&tp->ucopy.prequeue));
1522
1523                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1524                         sk_backlog_rcv(sk, skb1);
1525
1526                 tp->ucopy.memory = 0;
1527         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1528                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1529                                            POLLIN | POLLRDNORM | POLLRDBAND);
1530                 if (!inet_csk_ack_scheduled(sk))
1531                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1532                                                   (3 * tcp_rto_min(sk)) / 4,
1533                                                   TCP_RTO_MAX);
1534         }
1535         return true;
1536 }
1537 EXPORT_SYMBOL(tcp_prequeue);
1538
1539 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1540 {
1541         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1542
1543         /* Only socket owner can try to collapse/prune rx queues
1544          * to reduce memory overhead, so add a little headroom here.
1545          * Few sockets backlog are possibly concurrently non empty.
1546          */
1547         limit += 64*1024;
1548
1549         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1550          * we can fix skb->truesize to its real value to avoid future drops.
1551          * This is valid because skb is not yet charged to the socket.
1552          * It has been noticed pure SACK packets were sometimes dropped
1553          * (if cooked by drivers without copybreak feature).
1554          */
1555         skb_condense(skb);
1556
1557         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1558                 bh_unlock_sock(sk);
1559                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1560                 return true;
1561         }
1562         return false;
1563 }
1564 EXPORT_SYMBOL(tcp_add_backlog);
1565
1566 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1567 {
1568         struct tcphdr *th = (struct tcphdr *)skb->data;
1569         unsigned int eaten = skb->len;
1570         int err;
1571
1572         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1573         if (!err) {
1574                 eaten -= skb->len;
1575                 TCP_SKB_CB(skb)->end_seq -= eaten;
1576         }
1577         return err;
1578 }
1579 EXPORT_SYMBOL(tcp_filter);
1580
1581 /*
1582  *      From tcp_input.c
1583  */
1584
     /*
      * Receive path entry point for an IPv4 TCP segment.
      *
      * Validates the TCP header and checksum, fills in TCP_SKB_CB(skb), looks
      * the segment up in tcp_hashinfo and then dispatches on the state of the
      * socket that was found (TIME_WAIT, NEW_SYN_RECV, LISTEN or anything else).
      *
      * Returns the value of tcp_v4_do_rcv() when that function is called
      * directly from here, and 0 on every other path (segment discarded,
      * prequeued, backlogged, or handed to a child socket).
      */
1585 int tcp_v4_rcv(struct sk_buff *skb)
1586 {
1587         struct net *net = dev_net(skb->dev);
1588         const struct iphdr *iph;
1589         const struct tcphdr *th;
1590         bool refcounted;
1591         struct sock *sk;
1592         int ret;
1593
1594         if (skb->pkt_type != PACKET_HOST)
1595                 goto discard_it;
1596
1597         /* Count it even if it's bad */
1598         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1599
1600         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1601                 goto discard_it;
1602
1603         th = (const struct tcphdr *)skb->data;
1604
1605         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1606                 goto bad_packet;
1607         if (!pskb_may_pull(skb, th->doff * 4))
1608                 goto discard_it;
1609
1610         /* An explanation is required here, I think.
1611          * Packet length and doff are validated by header prediction,
1612          * provided case of th->doff==0 is eliminated.
1613          * So, we defer the checks. */
1614
1615         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1616                 goto csum_error;
1617
             /* Reload the header pointers after the pull/checksum calls above. */
1618         th = (const struct tcphdr *)skb->data;
1619         iph = ip_hdr(skb);
1620         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1621          * barrier() makes sure compiler wont play fool^Waliasing games.
1622          */
1623         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1624                 sizeof(struct inet_skb_parm));
1625         barrier();
1626
1627         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1628         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1629                                     skb->len - th->doff * 4);
1630         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1631         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1632         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1633         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1634         TCP_SKB_CB(skb)->sacked  = 0;
1635
1636 lookup:
1637         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1638                                th->dest, &refcounted);
1639         if (!sk)
1640                 goto no_tcp_socket;
1641
1642 process:
1643         if (sk->sk_state == TCP_TIME_WAIT)
1644                 goto do_time_wait;
1645
             /* The lookup returned a request sock: continue on its listener. */
1646         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1647                 struct request_sock *req = inet_reqsk(sk);
1648                 struct sock *nsk;
1649
1650                 sk = req->rsk_listener;
1651                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1652                         sk_drops_add(sk, skb);
1653                         reqsk_put(req);
1654                         goto discard_it;
1655                 }
1656                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1657                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1658                         goto lookup;
1659                 }
1660                 /* We own a reference on the listener, increase it again
1661                  * as we might lose it too soon.
1662                  */
1663                 sock_hold(sk);
1664                 refcounted = true;
1665                 nsk = tcp_check_req(sk, skb, req, false);
1666                 if (!nsk) {
1667                         reqsk_put(req);
1668                         goto discard_and_relse;
1669                 }
1670                 if (nsk == sk) {
1671                         reqsk_put(req);
1672                 } else if (tcp_child_process(sk, nsk, skb)) {
1673                         tcp_v4_send_reset(nsk, skb);
1674                         goto discard_and_relse;
1675                 } else {
1676                         sock_put(sk);
1677                         return 0;
1678                 }
1679         }
1680         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1681                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1682                 goto discard_and_relse;
1683         }
1684
1685         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1686                 goto discard_and_relse;
1687
1688         if (tcp_v4_inbound_md5_hash(sk, skb))
1689                 goto discard_and_relse;
1690
1691         nf_reset(skb);
1692
1693         if (tcp_filter(sk, skb))
1694                 goto discard_and_relse;
             /* Reload the header pointers once more after the socket filter ran. */
1695         th = (const struct tcphdr *)skb->data;
1696         iph = ip_hdr(skb);
1697
1698         skb->dev = NULL;
1699
             /* Listeners are handled without taking the socket lock here. */
1700         if (sk->sk_state == TCP_LISTEN) {
1701                 ret = tcp_v4_do_rcv(sk, skb);
1702                 goto put_and_return;
1703         }
1704
1705         sk_incoming_cpu_update(sk);
1706
1707         bh_lock_sock_nested(sk);
1708         tcp_segs_in(tcp_sk(sk), skb);
1709         ret = 0;
             /* Socket not owned by user: try the prequeue, else process now.
              * Otherwise queue to the backlog; when tcp_add_backlog() rejects
              * the skb it has already released the socket lock itself.
              */
1710         if (!sock_owned_by_user(sk)) {
1711                 if (!tcp_prequeue(sk, skb))
1712                         ret = tcp_v4_do_rcv(sk, skb);
1713         } else if (tcp_add_backlog(sk, skb)) {
1714                 goto discard_and_relse;
1715         }
1716         bh_unlock_sock(sk);
1717
1718 put_and_return:
1719         if (refcounted)
1720                 sock_put(sk);
1721
1722         return ret;
1723
1724 no_tcp_socket:
1725         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1726                 goto discard_it;
1727
             /* csum_error and bad_packet are jump targets inside this branch:
              * a checksum error bumps both counters, a bad packet only INERRS.
              * A segment with a valid checksum and no socket gets a reset.
              */
1728         if (tcp_checksum_complete(skb)) {
1729 csum_error:
1730                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1731 bad_packet:
1732                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1733         } else {
1734                 tcp_v4_send_reset(NULL, skb);
1735         }
1736
1737 discard_it:
1738         /* Discard frame. */
1739         kfree_skb(skb);
1740         return 0;
1741
1742 discard_and_relse:
             /* Account the drop on sk and release our reference, if we hold one. */
1743         sk_drops_add(sk, skb);
1744         if (refcounted)
1745                 sock_put(sk);
1746         goto discard_it;
1747
1748 do_time_wait:
1749         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1750                 inet_twsk_put(inet_twsk(sk));
1751                 goto discard_it;
1752         }
1753
1754         if (tcp_checksum_complete(skb)) {
1755                 inet_twsk_put(inet_twsk(sk));
1756                 goto csum_error;
1757         }
1758         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1759         case TCP_TW_SYN: {
1760                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1761                                                         &tcp_hashinfo, skb,
1762                                                         __tcp_hdrlen(th),
1763                                                         iph->saddr, th->source,
1764                                                         iph->daddr, th->dest,
1765                                                         inet_iif(skb));
                     /* A listener matches: release the timewait sock and
                      * restart processing of this segment on the listener.
                      */
1766                 if (sk2) {
1767                         inet_twsk_deschedule_put(inet_twsk(sk));
1768                         sk = sk2;
1769                         refcounted = false;
1770                         goto process;
1771                 }
1772                 /* Fall through to ACK */
1773         }
1774         case TCP_TW_ACK:
1775                 tcp_v4_timewait_ack(sk, skb);
1776                 break;
1777         case TCP_TW_RST:
1778                 tcp_v4_send_reset(sk, skb);
1779                 inet_twsk_deschedule_put(inet_twsk(sk));
1780                 goto discard_it;
1781         case TCP_TW_SUCCESS:;
1782         }
1783         goto discard_it;
1784 }
1785
1786 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1787         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1788         .twsk_unique    = tcp_twsk_unique,
1789         .twsk_destructor= tcp_twsk_destructor,
1790 };
1791
1792 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1793 {
1794         struct dst_entry *dst = skb_dst(skb);
1795
1796         if (dst && dst_hold_safe(dst)) {
1797                 sk->sk_rx_dst = dst;
1798                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1799         }
1800 }
1801 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1802
1803 const struct inet_connection_sock_af_ops ipv4_specific = {
1804         .queue_xmit        = ip_queue_xmit,
1805         .send_check        = tcp_v4_send_check,
1806         .rebuild_header    = inet_sk_rebuild_header,
1807         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1808         .conn_request      = tcp_v4_conn_request,
1809         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1810         .net_header_len    = sizeof(struct iphdr),
1811         .setsockopt        = ip_setsockopt,
1812         .getsockopt        = ip_getsockopt,
1813         .addr2sockaddr     = inet_csk_addr2sockaddr,
1814         .sockaddr_len      = sizeof(struct sockaddr_in),
1815 #ifdef CONFIG_COMPAT
1816         .compat_setsockopt = compat_ip_setsockopt,
1817         .compat_getsockopt = compat_ip_getsockopt,
1818 #endif
1819         .mtu_reduced       = tcp_v4_mtu_reduced,
1820 };
1821 EXPORT_SYMBOL(ipv4_specific);
1822
#ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 TCP-MD5 signature operations; only built with CONFIG_TCP_MD5SIG.
 * tcp_v4_init_sock() installs this table as af_specific.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_parse	= tcp_v4_parse_md5_keys,
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
};
#endif
1830
1831 /* NOTE: A lot of things set to zero explicitly by call to
1832  *       sk_alloc() so need not be done here.
1833  */
1834 static int tcp_v4_init_sock(struct sock *sk)
1835 {
1836         struct inet_connection_sock *icsk = inet_csk(sk);
1837
1838         tcp_init_sock(sk);
1839
1840         icsk->icsk_af_ops = &ipv4_specific;
1841
1842 #ifdef CONFIG_TCP_MD5SIG
1843         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1844 #endif
1845
1846         return 0;
1847 }
1848
1849 void tcp_v4_destroy_sock(struct sock *sk)
1850 {
1851         struct tcp_sock *tp = tcp_sk(sk);
1852
1853         tcp_clear_xmit_timers(sk);
1854
1855         tcp_cleanup_congestion_control(sk);
1856
1857         /* Cleanup up the write buffer. */
1858         tcp_write_queue_purge(sk);
1859
1860         /* Cleans up our, hopefully empty, out_of_order_queue. */
1861         skb_rbtree_purge(&tp->out_of_order_queue);
1862
1863 #ifdef CONFIG_TCP_MD5SIG
1864         /* Clean up the MD5 key list, if any */
1865         if (tp->md5sig_info) {
1866                 tcp_clear_md5_list(sk);
1867                 kfree_rcu(tp->md5sig_info, rcu);
1868                 tp->md5sig_info = NULL;
1869         }
1870 #endif
1871
1872         /* Clean prequeue, it must be empty really */
1873         __skb_queue_purge(&tp->ucopy.prequeue);
1874
1875         /* Clean up a referenced TCP bind bucket. */
1876         if (inet_csk(sk)->icsk_bind_hash)
1877                 inet_put_port(sk);
1878
1879         BUG_ON(tp->fastopen_rsk);
1880
1881         /* If socket is aborted during connect operation */
1882         tcp_free_fastopen_req(tp);
1883         tcp_saved_syn_free(tp);
1884
1885         sk_sockets_allocated_dec(sk);
1886 }
1887 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1888
1889 #ifdef CONFIG_PROC_FS
1890 /* Proc filesystem TCP sock list dumping. */
1891
1892 /*
1893  * Get next listener socket follow cur.  If cur is NULL, get first socket
1894  * starting from bucket given in st->bucket; when st->bucket is zero the
1895  * very first socket in the hash table is returned.
1896  */
1897 static void *listening_get_next(struct seq_file *seq, void *cur)
1898 {
1899         struct tcp_iter_state *st = seq->private;
1900         struct net *net = seq_file_net(seq);
1901         struct inet_listen_hashbucket *ilb;
1902         struct sock *sk = cur;
1903
1904         if (!sk) {
1905 get_head:
1906                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1907                 spin_lock(&ilb->lock);
1908                 sk = sk_head(&ilb->head);
1909                 st->offset = 0;
1910                 goto get_sk;
1911         }
1912         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1913         ++st->num;
1914         ++st->offset;
1915
1916         sk = sk_next(sk);
1917 get_sk:
1918         sk_for_each_from(sk) {
1919                 if (!net_eq(sock_net(sk), net))
1920                         continue;
1921                 if (sk->sk_family == st->family)
1922                         return sk;
1923         }
1924         spin_unlock(&ilb->lock);
1925         st->offset = 0;
1926         if (++st->bucket < INET_LHTABLE_SIZE)
1927                 goto get_head;
1928         return NULL;
1929 }
1930
1931 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1932 {
1933         struct tcp_iter_state *st = seq->private;
1934         void *rc;
1935
1936         st->bucket = 0;
1937         st->offset = 0;
1938         rc = listening_get_next(seq, NULL);
1939
1940         while (rc && *pos) {
1941                 rc = listening_get_next(seq, rc);
1942                 --*pos;
1943         }
1944         return rc;
1945 }
1946
1947 static inline bool empty_bucket(const struct tcp_iter_state *st)
1948 {
1949         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1950 }
1951
1952 /*
1953  * Get first established socket starting from bucket given in st->bucket.
1954  * If st->bucket is zero, the very first socket in the hash is returned.
1955  */
1956 static void *established_get_first(struct seq_file *seq)
1957 {
1958         struct tcp_iter_state *st = seq->private;
1959         struct net *net = seq_file_net(seq);
1960         void *rc = NULL;
1961
1962         st->offset = 0;
1963         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1964                 struct sock *sk;
1965                 struct hlist_nulls_node *node;
1966                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1967
1968                 /* Lockless fast path for the common case of empty buckets */
1969                 if (empty_bucket(st))
1970                         continue;
1971
1972                 spin_lock_bh(lock);
1973                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1974                         if (sk->sk_family != st->family ||
1975                             !net_eq(sock_net(sk), net)) {
1976                                 continue;
1977                         }
1978                         rc = sk;
1979                         goto out;
1980                 }
1981                 spin_unlock_bh(lock);
1982         }
1983 out:
1984         return rc;
1985 }
1986
1987 static void *established_get_next(struct seq_file *seq, void *cur)
1988 {
1989         struct sock *sk = cur;
1990         struct hlist_nulls_node *node;
1991         struct tcp_iter_state *st = seq->private;
1992         struct net *net = seq_file_net(seq);
1993
1994         ++st->num;
1995         ++st->offset;
1996
1997         sk = sk_nulls_next(sk);
1998
1999         sk_nulls_for_each_from(sk, node) {
2000                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2001                         return sk;
2002         }
2003
2004         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2005         ++st->bucket;
2006         return established_get_first(seq);
2007 }
2008
2009 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2010 {
2011         struct tcp_iter_state *st = seq->private;
2012         void *rc;
2013
2014         st->bucket = 0;
2015         rc = established_get_first(seq);
2016
2017         while (rc && pos) {
2018                 rc = established_get_next(seq, rc);
2019                 --pos;
2020         }
2021         return rc;
2022 }
2023
2024 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2025 {
2026         void *rc;
2027         struct tcp_iter_state *st = seq->private;
2028
2029         st->state = TCP_SEQ_STATE_LISTENING;
2030         rc        = listening_get_idx(seq, &pos);
2031
2032         if (!rc) {
2033                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2034                 rc        = established_get_idx(seq, pos);
2035         }
2036
2037         return rc;
2038 }
2039
2040 static void *tcp_seek_last_pos(struct seq_file *seq)
2041 {
2042         struct tcp_iter_state *st = seq->private;
2043         int offset = st->offset;
2044         int orig_num = st->num;
2045         void *rc = NULL;
2046
2047         switch (st->state) {
2048         case TCP_SEQ_STATE_LISTENING:
2049                 if (st->bucket >= INET_LHTABLE_SIZE)
2050                         break;
2051                 st->state = TCP_SEQ_STATE_LISTENING;
2052                 rc = listening_get_next(seq, NULL);
2053                 while (offset-- && rc)
2054                         rc = listening_get_next(seq, rc);
2055                 if (rc)
2056                         break;
2057                 st->bucket = 0;
2058                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2059                 /* Fallthrough */
2060         case TCP_SEQ_STATE_ESTABLISHED:
2061                 if (st->bucket > tcp_hashinfo.ehash_mask)
2062                         break;
2063                 rc = established_get_first(seq);
2064                 while (offset-- && rc)
2065                         rc = established_get_next(seq, rc);
2066         }
2067
2068         st->num = orig_num;
2069
2070         return rc;
2071 }
2072
2073 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2074 {
2075         struct tcp_iter_state *st = seq->private;
2076         void *rc;
2077
2078         if (*pos && *pos == st->last_pos) {
2079                 rc = tcp_seek_last_pos(seq);
2080                 if (rc)
2081                         goto out;
2082         }
2083
2084         st->state = TCP_SEQ_STATE_LISTENING;
2085         st->num = 0;
2086         st->bucket = 0;
2087         st->offset = 0;
2088         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2089
2090 out:
2091         st->last_pos = *pos;
2092         return rc;
2093 }
2094
2095 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2096 {
2097         struct tcp_iter_state *st = seq->private;
2098         void *rc = NULL;
2099
2100         if (v == SEQ_START_TOKEN) {
2101                 rc = tcp_get_idx(seq, 0);
2102                 goto out;
2103         }
2104
2105         switch (st->state) {
2106         case TCP_SEQ_STATE_LISTENING:
2107                 rc = listening_get_next(seq, v);
2108                 if (!rc) {
2109                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2110                         st->bucket = 0;
2111                         st->offset = 0;
2112                         rc        = established_get_first(seq);
2113                 }
2114                 break;
2115         case TCP_SEQ_STATE_ESTABLISHED:
2116                 rc = established_get_next(seq, v);
2117                 break;
2118         }
2119 out:
2120         ++*pos;
2121         st->last_pos = *pos;
2122         return rc;
2123 }
2124
2125 static void tcp_seq_stop(struct seq_file *seq, void *v)
2126 {
2127         struct tcp_iter_state *st = seq->private;
2128
2129         switch (st->state) {
2130         case TCP_SEQ_STATE_LISTENING:
2131                 if (v != SEQ_START_TOKEN)
2132                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2133                 break;
2134         case TCP_SEQ_STATE_ESTABLISHED:
2135                 if (v)
2136                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2137                 break;
2138         }
2139 }
2140
2141 int tcp_seq_open(struct inode *inode, struct file *file)
2142 {
2143         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2144         struct tcp_iter_state *s;
2145         int err;
2146
2147         err = seq_open_net(inode, file, &afinfo->seq_ops,
2148                           sizeof(struct tcp_iter_state));
2149         if (err < 0)
2150                 return err;
2151
2152         s = ((struct seq_file *)file->private_data)->private;
2153         s->family               = afinfo->family;
2154         s->last_pos             = 0;
2155         return 0;
2156 }
2157 EXPORT_SYMBOL(tcp_seq_open);
2158
2159 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2160 {
2161         int rc = 0;
2162         struct proc_dir_entry *p;
2163
2164         afinfo->seq_ops.start           = tcp_seq_start;
2165         afinfo->seq_ops.next            = tcp_seq_next;
2166         afinfo->seq_ops.stop            = tcp_seq_stop;
2167
2168         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2169                              afinfo->seq_fops, afinfo);
2170         if (!p)
2171                 rc = -ENOMEM;
2172         return rc;
2173 }
2174 EXPORT_SYMBOL(tcp_proc_register);
2175
2176 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2177 {
2178         remove_proc_entry(afinfo->name, net->proc_net);
2179 }
2180 EXPORT_SYMBOL(tcp_proc_unregister);
2181
2182 static void get_openreq4(const struct request_sock *req,
2183                          struct seq_file *f, int i)
2184 {
2185         const struct inet_request_sock *ireq = inet_rsk(req);
2186         long delta = req->rsk_timer.expires - jiffies;
2187
2188         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2189                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2190                 i,
2191                 ireq->ir_loc_addr,
2192                 ireq->ir_num,
2193                 ireq->ir_rmt_addr,
2194                 ntohs(ireq->ir_rmt_port),
2195                 TCP_SYN_RECV,
2196                 0, 0, /* could print option size, but that is af dependent. */
2197                 1,    /* timers active (only the expire timer) */
2198                 jiffies_delta_to_clock_t(delta),
2199                 req->num_timeout,
2200                 from_kuid_munged(seq_user_ns(f),
2201                                  sock_i_uid(req->rsk_listener)),
2202                 0,  /* non standard timer */
2203                 0, /* open_requests have no inode */
2204                 0,
2205                 req);
2206 }
2207
2208 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2209 {
2210         int timer_active;
2211         unsigned long timer_expires;
2212         const struct tcp_sock *tp = tcp_sk(sk);
2213         const struct inet_connection_sock *icsk = inet_csk(sk);
2214         const struct inet_sock *inet = inet_sk(sk);
2215         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2216         __be32 dest = inet->inet_daddr;
2217         __be32 src = inet->inet_rcv_saddr;
2218         __u16 destp = ntohs(inet->inet_dport);
2219         __u16 srcp = ntohs(inet->inet_sport);
2220         int rx_queue;
2221         int state;
2222
2223         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2224             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2225             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2226                 timer_active    = 1;
2227                 timer_expires   = icsk->icsk_timeout;
2228         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2229                 timer_active    = 4;
2230                 timer_expires   = icsk->icsk_timeout;
2231         } else if (timer_pending(&sk->sk_timer)) {
2232                 timer_active    = 2;
2233                 timer_expires   = sk->sk_timer.expires;
2234         } else {
2235                 timer_active    = 0;
2236                 timer_expires = jiffies;
2237         }
2238
2239         state = sk_state_load(sk);
2240         if (state == TCP_LISTEN)
2241                 rx_queue = sk->sk_ack_backlog;
2242         else
2243                 /* Because we don't lock the socket,
2244                  * we might find a transient negative value.
2245                  */
2246                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2247
2248         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2249                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2250                 i, src, srcp, dest, destp, state,
2251                 tp->write_seq - tp->snd_una,
2252                 rx_queue,
2253                 timer_active,
2254                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2255                 icsk->icsk_retransmits,
2256                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2257                 icsk->icsk_probes_out,
2258                 sock_i_ino(sk),
2259                 atomic_read(&sk->sk_refcnt), sk,
2260                 jiffies_to_clock_t(icsk->icsk_rto),
2261                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2262                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2263                 tp->snd_cwnd,
2264                 state == TCP_LISTEN ?
2265                     fastopenq->max_qlen :
2266                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2267 }
2268
2269 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2270                                struct seq_file *f, int i)
2271 {
2272         long delta = tw->tw_timer.expires - jiffies;
2273         __be32 dest, src;
2274         __u16 destp, srcp;
2275
2276         dest  = tw->tw_daddr;
2277         src   = tw->tw_rcv_saddr;
2278         destp = ntohs(tw->tw_dport);
2279         srcp  = ntohs(tw->tw_sport);
2280
2281         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2282                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2283                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2284                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2285                 atomic_read(&tw->tw_refcnt), tw);
2286 }
2287
2288 #define TMPSZ 150
2289
2290 static int tcp4_seq_show(struct seq_file *seq, void *v)
2291 {
2292         struct tcp_iter_state *st;
2293         struct sock *sk = v;
2294
2295         seq_setwidth(seq, TMPSZ - 1);
2296         if (v == SEQ_START_TOKEN) {
2297                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2298                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2299                            "inode");
2300                 goto out;
2301         }
2302         st = seq->private;
2303
2304         if (sk->sk_state == TCP_TIME_WAIT)
2305                 get_timewait4_sock(v, seq, st->num);
2306         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2307                 get_openreq4(v, seq, st->num);
2308         else
2309                 get_tcp4_sock(v, seq, st->num);
2310 out:
2311         seq_pad(seq, '\n');
2312         return 0;
2313 }
2314
2315 static const struct file_operations tcp_afinfo_seq_fops = {
2316         .owner   = THIS_MODULE,
2317         .open    = tcp_seq_open,
2318         .read    = seq_read,
2319         .llseek  = seq_lseek,
2320         .release = seq_release_net
2321 };
2322
2323 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2324         .name           = "tcp",
2325         .family         = AF_INET,
2326         .seq_fops       = &tcp_afinfo_seq_fops,
2327         .seq_ops        = {
2328                 .show           = tcp4_seq_show,
2329         },
2330 };
2331
2332 static int __net_init tcp4_proc_init_net(struct net *net)
2333 {
2334         return tcp_proc_register(net, &tcp4_seq_afinfo);
2335 }
2336
2337 static void __net_exit tcp4_proc_exit_net(struct net *net)
2338 {
2339         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2340 }
2341
2342 static struct pernet_operations tcp4_net_ops = {
2343         .init = tcp4_proc_init_net,
2344         .exit = tcp4_proc_exit_net,
2345 };
2346
2347 int __init tcp4_proc_init(void)
2348 {
2349         return register_pernet_subsys(&tcp4_net_ops);
2350 }
2351
2352 void tcp4_proc_exit(void)
2353 {
2354         unregister_pernet_subsys(&tcp4_net_ops);
2355 }
2356 #endif /* CONFIG_PROC_FS */
2357
2358 struct proto tcp_prot = {
2359         .name                   = "TCP",
2360         .owner                  = THIS_MODULE,
2361         .close                  = tcp_close,
2362         .connect                = tcp_v4_connect,
2363         .disconnect             = tcp_disconnect,
2364         .accept                 = inet_csk_accept,
2365         .ioctl                  = tcp_ioctl,
2366         .init                   = tcp_v4_init_sock,
2367         .destroy                = tcp_v4_destroy_sock,
2368         .shutdown               = tcp_shutdown,
2369         .setsockopt             = tcp_setsockopt,
2370         .getsockopt             = tcp_getsockopt,
2371         .keepalive              = tcp_set_keepalive,
2372         .recvmsg                = tcp_recvmsg,
2373         .sendmsg                = tcp_sendmsg,
2374         .sendpage               = tcp_sendpage,
2375         .backlog_rcv            = tcp_v4_do_rcv,
2376         .release_cb             = tcp_release_cb,
2377         .hash                   = inet_hash,
2378         .unhash                 = inet_unhash,
2379         .get_port               = inet_csk_get_port,
2380         .enter_memory_pressure  = tcp_enter_memory_pressure,
2381         .stream_memory_free     = tcp_stream_memory_free,
2382         .sockets_allocated      = &tcp_sockets_allocated,
2383         .orphan_count           = &tcp_orphan_count,
2384         .memory_allocated       = &tcp_memory_allocated,
2385         .memory_pressure        = &tcp_memory_pressure,
2386         .sysctl_mem             = sysctl_tcp_mem,
2387         .sysctl_wmem            = sysctl_tcp_wmem,
2388         .sysctl_rmem            = sysctl_tcp_rmem,
2389         .max_header             = MAX_TCP_HEADER,
2390         .obj_size               = sizeof(struct tcp_sock),
2391         .slab_flags             = SLAB_DESTROY_BY_RCU,
2392         .twsk_prot              = &tcp_timewait_sock_ops,
2393         .rsk_prot               = &tcp_request_sock_ops,
2394         .h.hashinfo             = &tcp_hashinfo,
2395         .no_autobind            = true,
2396 #ifdef CONFIG_COMPAT
2397         .compat_setsockopt      = compat_tcp_setsockopt,
2398         .compat_getsockopt      = compat_tcp_getsockopt,
2399 #endif
2400         .diag_destroy           = tcp_abort,
2401 };
2402 EXPORT_SYMBOL(tcp_prot);
2403
2404 static void __net_exit tcp_sk_exit(struct net *net)
2405 {
2406         int cpu;
2407
2408         for_each_possible_cpu(cpu)
2409                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2410         free_percpu(net->ipv4.tcp_sk);
2411 }
2412
2413 static int __net_init tcp_sk_init(struct net *net)
2414 {
2415         int res, cpu, cnt;
2416
2417         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2418         if (!net->ipv4.tcp_sk)
2419                 return -ENOMEM;
2420
2421         for_each_possible_cpu(cpu) {
2422                 struct sock *sk;
2423
2424                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2425                                            IPPROTO_TCP, net);
2426                 if (res)
2427                         goto fail;
2428                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2429                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2430         }
2431
2432         net->ipv4.sysctl_tcp_ecn = 2;
2433         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2434
2435         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2436         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2437         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2438
2439         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2440         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2441         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2442
2443         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2444         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2445         net->ipv4.sysctl_tcp_syncookies = 1;
2446         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2447         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2448         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2449         net->ipv4.sysctl_tcp_orphan_retries = 0;
2450         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2451         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2452         net->ipv4.sysctl_tcp_tw_reuse = 0;
2453
2454         cnt = tcp_hashinfo.ehash_mask + 1;
2455         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2456         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2457
2458         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2459
2460         return 0;
2461 fail:
2462         tcp_sk_exit(net);
2463
2464         return res;
2465 }
2466
2467 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2468 {
2469         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2470 }
2471
2472 static struct pernet_operations __net_initdata tcp_sk_ops = {
2473        .init       = tcp_sk_init,
2474        .exit       = tcp_sk_exit,
2475        .exit_batch = tcp_sk_exit_batch,
2476 };
2477
2478 void __init tcp_v4_init(void)
2479 {
2480         if (register_pernet_subsys(&tcp_sk_ops))
2481                 panic("Failed to create the TCP control socket.\n");
2482 }