[linux-2.6-block.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
47  *                                      year-long coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/xfrm.h>
73
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90                        struct sk_buff *skb);
91
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93         .lhash_lock     = RW_LOCK_UNLOCKED,
94         .lhash_users    = ATOMIC_INIT(0),
95         .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 };
97
98 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
99 {
100         return inet_csk_get_port(&tcp_hashinfo, sk, snum);
101 }
102
103 static void tcp_v4_hash(struct sock *sk)
104 {
105         inet_hash(&tcp_hashinfo, sk);
106 }
107
108 void tcp_unhash(struct sock *sk)
109 {
110         inet_unhash(&tcp_hashinfo, sk);
111 }
112
113 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
114 {
115         return secure_tcp_sequence_number(skb->nh.iph->daddr,
116                                           skb->nh.iph->saddr,
117                                           skb->h.th->dest,
118                                           skb->h.th->source);
119 }
120
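/*
 * Check whether the four-tuple (saddr, sport, daddr, dport) chosen for a
 * connect() is unique.  The established hash table is split in two halves:
 * regular sockets hang off head->chain, while TIME-WAIT sockets for the same
 * bucket live at (head + ehash_size)->chain.  A matching TIME-WAIT entry may
 * be recycled when its timestamps allow it (see the PAWS comment below); the
 * old bucket is then handed back via *twp, or descheduled right here when
 * twp is NULL.  Returns 0 when the tuple is usable, -EADDRNOTAVAIL otherwise.
 */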
121 /* called with local bh disabled */
122 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
123                                       struct inet_timewait_sock **twp)
124 {
125         struct inet_sock *inet = inet_sk(sk);
126         u32 daddr = inet->rcv_saddr;
127         u32 saddr = inet->daddr;
128         int dif = sk->sk_bound_dev_if;
129         INET_ADDR_COOKIE(acookie, saddr, daddr)
130         const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
131         unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
132         struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
133         struct sock *sk2;
134         const struct hlist_node *node;
135         struct inet_timewait_sock *tw;
136
137         prefetch(head->chain.first);
138         write_lock(&head->lock);
139
140         /* Check TIME-WAIT sockets first. */
141         sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
142                 tw = inet_twsk(sk2);
143
144                 if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
145                         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
146                         struct tcp_sock *tp = tcp_sk(sk);
147
148                         /* With PAWS, it is safe from the viewpoint
149                            of data integrity. Even without PAWS it
150                            is safe provided sequence spaces do not
151                            overlap, i.e. at data rates <= 80Mbit/sec.
152
153                            Actually, the idea is close to VJ's:
154                            only the timestamp cache is held not per host
155                            but per port pair, and the TW bucket is used
156                            as the state holder.
157
158                            If the TW bucket has already been destroyed we
159                            fall back to VJ's scheme and use the initial
160                            timestamp retrieved from the peer table.
161                          */
162                         if (tcptw->tw_ts_recent_stamp &&
163                             (!twp || (sysctl_tcp_tw_reuse &&
164                                       xtime.tv_sec -
165                                       tcptw->tw_ts_recent_stamp > 1))) {
166                                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
167                                 if (tp->write_seq == 0)
168                                         tp->write_seq = 1;
169                                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
170                                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
171                                 sock_hold(sk2);
172                                 goto unique;
173                         } else
174                                 goto not_unique;
175                 }
176         }
177         tw = NULL;
178
179         /* And established part... */
180         sk_for_each(sk2, node, &head->chain) {
181                 if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
182                         goto not_unique;
183         }
184
185 unique:
186         /* Must record num and sport now. Otherwise we will see
187          * a socket with a funny identity in the hash table. */
188         inet->num = lport;
189         inet->sport = htons(lport);
190         sk->sk_hash = hash;
191         BUG_TRAP(sk_unhashed(sk));
192         __sk_add_node(sk, &head->chain);
193         sock_prot_inc_use(sk->sk_prot);
194         write_unlock(&head->lock);
195
196         if (twp) {
197                 *twp = tw;
198                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
199         } else if (tw) {
200                 /* Silly. Should hash-dance instead... */
201                 inet_twsk_deschedule(tw, &tcp_death_row);
202                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
203
204                 inet_twsk_put(tw);
205         }
206
207         return 0;
208
209 not_unique:
210         write_unlock(&head->lock);
211         return -EADDRNOTAVAIL;
212 }
213
214 static inline u32 connect_port_offset(const struct sock *sk)
215 {
216         const struct inet_sock *inet = inet_sk(sk);
217
218         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
219                                          inet->dport);
220 }
221
222 /*
223  * Bind a port for a connect operation and hash it.
224  */
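/*
 * When no local port is bound yet, search the ephemeral range starting at a
 * per-destination offset (secure_tcp_port_ephemeral()) plus a rolling static
 * hint.  Bind buckets with fastreuse >= 0 belong to ordinary bind() users and
 * are skipped; for the rest, four-tuple uniqueness is checked against the
 * established/TIME-WAIT hash, which also lets a matching TIME-WAIT bucket be
 * recycled.  When a port is already bound and we are its only owner, the
 * socket is simply hashed; otherwise we fall back to the established check.
 */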
225 static inline int tcp_v4_hash_connect(struct sock *sk)
226 {
227         const unsigned short snum = inet_sk(sk)->num;
228         struct inet_bind_hashbucket *head;
229         struct inet_bind_bucket *tb;
230         int ret;
231
232         if (!snum) {
233                 int low = sysctl_local_port_range[0];
234                 int high = sysctl_local_port_range[1];
235                 int range = high - low;
236                 int i;
237                 int port;
238                 static u32 hint;
239                 u32 offset = hint + connect_port_offset(sk);
240                 struct hlist_node *node;
241                 struct inet_timewait_sock *tw = NULL;
242
243                 local_bh_disable();
244                 for (i = 1; i <= range; i++) {
245                         port = low + (i + offset) % range;
246                         head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
247                         spin_lock(&head->lock);
248
249                         /* Does not bother with rcv_saddr checks,
250                          * because the established check is already
251                          * unique enough.
252                          */
253                         inet_bind_bucket_for_each(tb, node, &head->chain) {
254                                 if (tb->port == port) {
255                                         BUG_TRAP(!hlist_empty(&tb->owners));
256                                         if (tb->fastreuse >= 0)
257                                                 goto next_port;
258                                         if (!__tcp_v4_check_established(sk,
259                                                                         port,
260                                                                         &tw))
261                                                 goto ok;
262                                         goto next_port;
263                                 }
264                         }
265
266                         tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
267                         if (!tb) {
268                                 spin_unlock(&head->lock);
269                                 break;
270                         }
271                         tb->fastreuse = -1;
272                         goto ok;
273
274                 next_port:
275                         spin_unlock(&head->lock);
276                 }
277                 local_bh_enable();
278
279                 return -EADDRNOTAVAIL;
280
281 ok:
282                 hint += i;
283
284                 /* Head lock still held and bh's disabled */
285                 inet_bind_hash(sk, tb, port);
286                 if (sk_unhashed(sk)) {
287                         inet_sk(sk)->sport = htons(port);
288                         __inet_hash(&tcp_hashinfo, sk, 0);
289                 }
290                 spin_unlock(&head->lock);
291
292                 if (tw) {
293                         inet_twsk_deschedule(tw, &tcp_death_row);
294                         inet_twsk_put(tw);
295                 }
296
297                 ret = 0;
298                 goto out;
299         }
300
301         head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
302         tb  = inet_csk(sk)->icsk_bind_hash;
303         spin_lock_bh(&head->lock);
304         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
305                 __inet_hash(&tcp_hashinfo, sk, 0);
306                 spin_unlock_bh(&head->lock);
307                 return 0;
308         } else {
309                 spin_unlock(&head->lock);
310                 /* No definite answer... Walk to established hash table */
311                 ret = __tcp_v4_check_established(sk, snum, NULL);
312 out:
313                 local_bh_enable();
314                 return ret;
315         }
316 }
317
318 /* This will initiate an outgoing connection. */
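/*
 * Roughly: resolve a route to the destination (honouring a source route in
 * the IP options, if any), pick our source address from the route when none
 * is bound, optionally seed PAWS timestamps from the inet_peer cache when
 * tw_recycle is enabled, grab a local port and hash the socket via
 * tcp_v4_hash_connect(), rebuild the route with the final port pair, choose
 * a secure initial sequence number, and finally send the SYN via
 * tcp_connect().
 */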
319 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
320 {
321         struct inet_sock *inet = inet_sk(sk);
322         struct tcp_sock *tp = tcp_sk(sk);
323         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
324         struct rtable *rt;
325         u32 daddr, nexthop;
326         int tmp;
327         int err;
328
329         if (addr_len < sizeof(struct sockaddr_in))
330                 return -EINVAL;
331
332         if (usin->sin_family != AF_INET)
333                 return -EAFNOSUPPORT;
334
335         nexthop = daddr = usin->sin_addr.s_addr;
336         if (inet->opt && inet->opt->srr) {
337                 if (!daddr)
338                         return -EINVAL;
339                 nexthop = inet->opt->faddr;
340         }
341
342         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
343                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
344                                IPPROTO_TCP,
345                                inet->sport, usin->sin_port, sk);
346         if (tmp < 0)
347                 return tmp;
348
349         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
350                 ip_rt_put(rt);
351                 return -ENETUNREACH;
352         }
353
354         if (!inet->opt || !inet->opt->srr)
355                 daddr = rt->rt_dst;
356
357         if (!inet->saddr)
358                 inet->saddr = rt->rt_src;
359         inet->rcv_saddr = inet->saddr;
360
361         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
362                 /* Reset inherited state */
363                 tp->rx_opt.ts_recent       = 0;
364                 tp->rx_opt.ts_recent_stamp = 0;
365                 tp->write_seq              = 0;
366         }
367
368         if (tcp_death_row.sysctl_tw_recycle &&
369             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
370                 struct inet_peer *peer = rt_get_peer(rt);
371
372                 /* VJ's idea. We save the last timestamp seen from
373                  * the destination in the peer table when entering TIME-WAIT,
374                  * and initialize rx_opt.ts_recent from it when trying a new connection.
375                  */
376
377                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
378                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
379                         tp->rx_opt.ts_recent = peer->tcp_ts;
380                 }
381         }
382
383         inet->dport = usin->sin_port;
384         inet->daddr = daddr;
385
386         tp->ext_header_len = 0;
387         if (inet->opt)
388                 tp->ext_header_len = inet->opt->optlen;
389
390         tp->rx_opt.mss_clamp = 536;
391
392         /* Socket identity is still unknown (sport may be zero).
393          * However we set the state to SYN-SENT and, without releasing the
394          * socket lock, select a source port, enter ourselves into the hash
395          * tables and complete initialization after this.
396          */
397         tcp_set_state(sk, TCP_SYN_SENT);
398         err = tcp_v4_hash_connect(sk);
399         if (err)
400                 goto failure;
401
402         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
403         if (err)
404                 goto failure;
405
406         /* OK, now commit destination to socket.  */
407         sk_setup_caps(sk, &rt->u.dst);
408
409         if (!tp->write_seq)
410                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
411                                                            inet->daddr,
412                                                            inet->sport,
413                                                            usin->sin_port);
414
415         inet->id = tp->write_seq ^ jiffies;
416
417         err = tcp_connect(sk);
418         rt = NULL;
419         if (err)
420                 goto failure;
421
422         return 0;
423
424 failure:
425         /* This unhashes the socket and releases the local port, if necessary. */
426         tcp_set_state(sk, TCP_CLOSE);
427         ip_rt_put(rt);
428         sk->sk_route_caps = 0;
429         inet->dport = 0;
430         return err;
431 }
432
433 /*
434  * This routine does path mtu discovery as defined in RFC1191.
435  */
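/*
 * Called from tcp_v4_err() on ICMP_FRAG_NEEDED, and only when the socket is
 * not currently owned by the user.  It updates the cached route's PMTU and,
 * if our cached value (pmtu_cookie) now exceeds the new MTU, lowers the MSS
 * via tcp_sync_mss() and retransmits immediately - the "fast path" MTU
 * discovery mentioned in the changelog above.
 */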
436 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
437                                      u32 mtu)
438 {
439         struct dst_entry *dst;
440         struct inet_sock *inet = inet_sk(sk);
441         struct tcp_sock *tp = tcp_sk(sk);
442
443         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
444          * sent out by Linux are always < 576 bytes, so they should go through
445          * unfragmented).
446          */
447         if (sk->sk_state == TCP_LISTEN)
448                 return;
449
450         /* We don't check in the dst entry whether pmtu discovery is forbidden
451          * on this route. We just assume that no packet-too-big packets
452          * are sent back when pmtu discovery is not active.
453          * There is a small race when the user changes this flag in the
454          * route, but I think that's acceptable.
455          */
456         if ((dst = __sk_dst_check(sk, 0)) == NULL)
457                 return;
458
459         dst->ops->update_pmtu(dst, mtu);
460
461         /* Something is about to go wrong... Remember the soft error
462          * in case this connection is not able to recover.
463          */
464         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
465                 sk->sk_err_soft = EMSGSIZE;
466
467         mtu = dst_mtu(dst);
468
469         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
470             tp->pmtu_cookie > mtu) {
471                 tcp_sync_mss(sk, mtu);
472
473                 /* Resend the TCP packet because it's
474                  * clear that the old packet has been
475                  * dropped. This is the new "fast" path mtu
476                  * discovery.
477                  */
478                 tcp_simple_retransmit(sk);
479         } /* else let the usual retransmit timer handle it */
480 }
481
482 /*
483  * This routine is called by the ICMP module when it gets some
484  * sort of error condition.  If err < 0 then the socket should
485  * be closed and the error returned to the user.  If err > 0
486  * it's just the icmp type << 8 | icmp code.  After adjustment
487  * header points to the first 8 bytes of the tcp header.  We need
488  * to find the appropriate port.
489  *
490  * The locking strategy used here is very "optimistic". When
491  * someone else accesses the socket, the ICMP is just dropped,
492  * and for some paths there is no check at all.
493  * A more general error queue for handling errors later
494  * is probably better.
495  *
496  */
497
498 void tcp_v4_err(struct sk_buff *skb, u32 info)
499 {
500         struct iphdr *iph = (struct iphdr *)skb->data;
501         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
502         struct tcp_sock *tp;
503         struct inet_sock *inet;
504         int type = skb->h.icmph->type;
505         int code = skb->h.icmph->code;
506         struct sock *sk;
507         __u32 seq;
508         int err;
509
510         if (skb->len < (iph->ihl << 2) + 8) {
511                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
512                 return;
513         }
514
515         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
516                          th->source, inet_iif(skb));
517         if (!sk) {
518                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
519                 return;
520         }
521         if (sk->sk_state == TCP_TIME_WAIT) {
522                 inet_twsk_put((struct inet_timewait_sock *)sk);
523                 return;
524         }
525
526         bh_lock_sock(sk);
527         /* If too many ICMPs get dropped on busy
528          * servers this needs to be solved differently.
529          */
530         if (sock_owned_by_user(sk))
531                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
532
533         if (sk->sk_state == TCP_CLOSE)
534                 goto out;
535
536         tp = tcp_sk(sk);
537         seq = ntohl(th->seq);
538         if (sk->sk_state != TCP_LISTEN &&
539             !between(seq, tp->snd_una, tp->snd_nxt)) {
540                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
541                 goto out;
542         }
543
544         switch (type) {
545         case ICMP_SOURCE_QUENCH:
546                 /* Just silently ignore these. */
547                 goto out;
548         case ICMP_PARAMETERPROB:
549                 err = EPROTO;
550                 break;
551         case ICMP_DEST_UNREACH:
552                 if (code > NR_ICMP_UNREACH)
553                         goto out;
554
555                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
556                         if (!sock_owned_by_user(sk))
557                                 do_pmtu_discovery(sk, iph, info);
558                         goto out;
559                 }
560
561                 err = icmp_err_convert[code].errno;
562                 break;
563         case ICMP_TIME_EXCEEDED:
564                 err = EHOSTUNREACH;
565                 break;
566         default:
567                 goto out;
568         }
569
570         switch (sk->sk_state) {
571                 struct request_sock *req, **prev;
572         case TCP_LISTEN:
573                 if (sock_owned_by_user(sk))
574                         goto out;
575
576                 req = inet_csk_search_req(sk, &prev, th->dest,
577                                           iph->daddr, iph->saddr);
578                 if (!req)
579                         goto out;
580
581                 /* ICMPs are not backlogged, hence we cannot get
582                    an established socket here.
583                  */
584                 BUG_TRAP(!req->sk);
585
586                 if (seq != tcp_rsk(req)->snt_isn) {
587                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
588                         goto out;
589                 }
590
591                 /*
592                  * Still in SYN_RECV, just remove it silently.
593                  * There is no good way to pass the error to the newly
594                  * created socket, and POSIX does not want network
595                  * errors returned from accept().
596                  */
597                 inet_csk_reqsk_queue_drop(sk, req, prev);
598                 goto out;
599
600         case TCP_SYN_SENT:
601         case TCP_SYN_RECV:  /* Cannot happen?
602                                It can, e.g. if SYNs crossed.
603                              */
604                 if (!sock_owned_by_user(sk)) {
605                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
606                         sk->sk_err = err;
607
608                         sk->sk_error_report(sk);
609
610                         tcp_done(sk);
611                 } else {
612                         sk->sk_err_soft = err;
613                 }
614                 goto out;
615         }
616
617         /* If we've already connected we will keep trying
618          * until we time out, or the user gives up.
619          *
620          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
621          * to be considered hard errors (well, FRAG_FAILED too,
622          * but it is obsoleted by pmtu discovery).
623          *
624          * Note that in the modern internet, where routing is unreliable
625          * and broken firewalls sit in every dark corner sending random
626          * errors ordered by their masters, even these two messages finally
627          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
628          *
629          * Now we are in compliance with RFCs.
630          *                                                      --ANK (980905)
631          */
632
633         inet = inet_sk(sk);
634         if (!sock_owned_by_user(sk) && inet->recverr) {
635                 sk->sk_err = err;
636                 sk->sk_error_report(sk);
637         } else  { /* Only an error on timeout */
638                 sk->sk_err_soft = err;
639         }
640
641 out:
642         bh_unlock_sock(sk);
643         sock_put(sk);
644 }
645
646 /* This routine computes an IPv4 TCP checksum. */
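/*
 * With CHECKSUM_HW the NIC finishes the job: we seed th->check with the
 * pseudo-header sum and record the offset of the check field in skb->csum so
 * the driver knows where to store the final value.  Otherwise the complete
 * checksum over the TCP header and data is computed here in software.
 */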
647 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
648                        struct sk_buff *skb)
649 {
650         struct inet_sock *inet = inet_sk(sk);
651
652         if (skb->ip_summed == CHECKSUM_HW) {
653                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
654                 skb->csum = offsetof(struct tcphdr, check);
655         } else {
656                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
657                                          csum_partial((char *)th,
658                                                       th->doff << 2,
659                                                       skb->csum));
660         }
661 }
662
663 /*
664  *      This routine will send an RST to the other tcp.
665  *
666  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
667  *                    for the reset?
668  *      Answer: if a packet caused the RST, it is not for a socket
669  *              existing in our system; if it is matched to a socket,
670  *              it is just a duplicate segment or a bug in the other side's TCP.
671  *              So we build the reply based only on the parameters that
672  *              arrived with the segment.
673  *      Exception: precedence violation. We do not implement it in any case.
674  */
675
676 static void tcp_v4_send_reset(struct sk_buff *skb)
677 {
678         struct tcphdr *th = skb->h.th;
679         struct tcphdr rth;
680         struct ip_reply_arg arg;
681
682         /* Never send a reset in response to a reset. */
683         if (th->rst)
684                 return;
685
686         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
687                 return;
688
689         /* Swap the send and the receive. */
690         memset(&rth, 0, sizeof(struct tcphdr));
691         rth.dest   = th->source;
692         rth.source = th->dest;
693         rth.doff   = sizeof(struct tcphdr) / 4;
694         rth.rst    = 1;
695
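        /* Per RFC 793 reset generation: if the offending segment carried an
         * ACK, the RST takes its sequence number from that ACK field;
         * otherwise the RST itself acknowledges everything the segment
         * occupied in sequence space (SYN and FIN each count for one, plus
         * the payload length).
         */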
696         if (th->ack) {
697                 rth.seq = th->ack_seq;
698         } else {
699                 rth.ack = 1;
700                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701                                     skb->len - (th->doff << 2));
702         }
703
704         memset(&arg, 0, sizeof arg);
705         arg.iov[0].iov_base = (unsigned char *)&rth;
706         arg.iov[0].iov_len  = sizeof rth;
707         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
708                                       skb->nh.iph->saddr, /*XXX*/
709                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
710         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
711
712         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
713
714         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
715         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
716 }
717
718 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
719    outside socket context, is certainly ugly. What can I do?
720  */
721
722 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
723                             u32 win, u32 ts)
724 {
725         struct tcphdr *th = skb->h.th;
726         struct {
727                 struct tcphdr th;
728                 u32 tsopt[3];
729         } rep;
730         struct ip_reply_arg arg;
731
732         memset(&rep.th, 0, sizeof(struct tcphdr));
733         memset(&arg, 0, sizeof arg);
734
735         arg.iov[0].iov_base = (unsigned char *)&rep;
736         arg.iov[0].iov_len  = sizeof(rep.th);
737         if (ts) {
738                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
739                                      (TCPOPT_TIMESTAMP << 8) |
740                                      TCPOLEN_TIMESTAMP);
741                 rep.tsopt[1] = htonl(tcp_time_stamp);
742                 rep.tsopt[2] = htonl(ts);
743                 arg.iov[0].iov_len = sizeof(rep);
744         }
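        /* The three words above encode NOP, NOP, TIMESTAMP (kind 8, length 10)
         * followed by TSval (our clock) and TSecr (the peer value being
         * echoed), padding the option block to a multiple of four bytes.
         */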
745
746         /* Swap the send and the receive. */
747         rep.th.dest    = th->source;
748         rep.th.source  = th->dest;
749         rep.th.doff    = arg.iov[0].iov_len / 4;
750         rep.th.seq     = htonl(seq);
751         rep.th.ack_seq = htonl(ack);
752         rep.th.ack     = 1;
753         rep.th.window  = htons(win);
754
755         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
756                                       skb->nh.iph->saddr, /*XXX*/
757                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
758         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
759
760         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
761
762         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
763 }
764
765 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
766 {
767         struct inet_timewait_sock *tw = inet_twsk(sk);
768         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
769
770         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
771                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
772
773         inet_twsk_put(tw);
774 }
775
776 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
777 {
778         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
779                         req->ts_recent);
780 }
781
782 /*
783  *      Send a SYN-ACK after having received an ACK.
784  *      This still operates on a request_sock only, not on a big
785  *      socket.
786  */
787 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
788                               struct dst_entry *dst)
789 {
790         const struct inet_request_sock *ireq = inet_rsk(req);
791         int err = -1;
792         struct sk_buff * skb;
793
794         /* First, grab a route. */
795         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
796                 goto out;
797
798         skb = tcp_make_synack(sk, dst, req);
799
800         if (skb) {
801                 struct tcphdr *th = skb->h.th;
802
803                 th->check = tcp_v4_check(th, skb->len,
804                                          ireq->loc_addr,
805                                          ireq->rmt_addr,
806                                          csum_partial((char *)th, skb->len,
807                                                       skb->csum));
808
809                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
810                                             ireq->rmt_addr,
811                                             ireq->opt);
812                 if (err == NET_XMIT_CN)
813                         err = 0;
814         }
815
816 out:
817         dst_release(dst);
818         return err;
819 }
820
821 /*
822  *      IPv4 request_sock destructor.
823  */
824 static void tcp_v4_reqsk_destructor(struct request_sock *req)
825 {
826         if (inet_rsk(req)->opt)
827                 kfree(inet_rsk(req)->opt);
828 }
829
830 static inline void syn_flood_warning(struct sk_buff *skb)
831 {
832         static unsigned long warntime;
833
834         if (time_after(jiffies, (warntime + HZ * 60))) {
835                 warntime = jiffies;
836                 printk(KERN_INFO
837                        "possible SYN flooding on port %d. Sending cookies.\n",
838                        ntohs(skb->h.th->dest));
839         }
840 }
841
842 /*
843  * Save and compile IPv4 options into the request_sock if needed.
844  */
845 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
846                                                      struct sk_buff *skb)
847 {
848         struct ip_options *opt = &(IPCB(skb)->opt);
849         struct ip_options *dopt = NULL;
850
851         if (opt && opt->optlen) {
852                 int opt_size = optlength(opt);
853                 dopt = kmalloc(opt_size, GFP_ATOMIC);
854                 if (dopt) {
855                         if (ip_options_echo(dopt, skb)) {
856                                 kfree(dopt);
857                                 dopt = NULL;
858                         }
859                 }
860         }
861         return dopt;
862 }
863
864 struct request_sock_ops tcp_request_sock_ops = {
865         .family         =       PF_INET,
866         .obj_size       =       sizeof(struct tcp_request_sock),
867         .rtx_syn_ack    =       tcp_v4_send_synack,
868         .send_ack       =       tcp_v4_reqsk_send_ack,
869         .destructor     =       tcp_v4_reqsk_destructor,
870         .send_reset     =       tcp_v4_send_reset,
871 };
872
873 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
874 {
875         struct inet_request_sock *ireq;
876         struct tcp_options_received tmp_opt;
877         struct request_sock *req;
878         __u32 saddr = skb->nh.iph->saddr;
879         __u32 daddr = skb->nh.iph->daddr;
880         __u32 isn = TCP_SKB_CB(skb)->when;
881         struct dst_entry *dst = NULL;
882 #ifdef CONFIG_SYN_COOKIES
883         int want_cookie = 0;
884 #else
885 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
886 #endif
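        /* On entry, TCP_SKB_CB(skb)->when carries a non-zero initial sequence
         * number when this SYN was already vetted by the TIME-WAIT recycling
         * path in tcp_timewait_state_process(); zero means a fresh request.
         */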
887
888         /* Never answer SYNs sent to broadcast or multicast */
889         if (((struct rtable *)skb->dst)->rt_flags &
890             (RTCF_BROADCAST | RTCF_MULTICAST))
891                 goto drop;
892
893         /* TW buckets are converted to open requests without
894          * limitation: they conserve resources and the peer is
895          * evidently a real one.
896          */
897         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
898 #ifdef CONFIG_SYN_COOKIES
899                 if (sysctl_tcp_syncookies) {
900                         want_cookie = 1;
901                 } else
902 #endif
903                 goto drop;
904         }
905
906         /* The accept backlog is full. If we have already queued enough
907          * warm entries in the syn queue, drop the request. That is better than
908          * clogging the syn queue with openreqs with exponentially increasing
909          * timeouts.
910          */
911         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
912                 goto drop;
913
914         req = reqsk_alloc(&tcp_request_sock_ops);
915         if (!req)
916                 goto drop;
917
918         tcp_clear_options(&tmp_opt);
919         tmp_opt.mss_clamp = 536;
920         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
921
922         tcp_parse_options(skb, &tmp_opt, 0);
923
924         if (want_cookie) {
925                 tcp_clear_options(&tmp_opt);
926                 tmp_opt.saw_tstamp = 0;
927         }
928
929         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
930                 /* Some OSes (unknown ones, but I see them on a web server
931                  * containing information interesting only for Windows
932                  * users) do not send their timestamp in the SYN. It is an
933                  * easy case: we simply do not advertise TS support.
934                  */
935                 tmp_opt.saw_tstamp = 0;
936                 tmp_opt.tstamp_ok  = 0;
937         }
938         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
939
940         tcp_openreq_init(req, &tmp_opt, skb);
941
942         ireq = inet_rsk(req);
943         ireq->loc_addr = daddr;
944         ireq->rmt_addr = saddr;
945         ireq->opt = tcp_v4_save_options(sk, skb);
946         if (!want_cookie)
947                 TCP_ECN_create_request(req, skb->h.th);
948
949         if (want_cookie) {
950 #ifdef CONFIG_SYN_COOKIES
951                 syn_flood_warning(skb);
952 #endif
953                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
954         } else if (!isn) {
955                 struct inet_peer *peer = NULL;
956
957                 /* VJ's idea. We save the last timestamp seen
958                  * from the destination in the peer table when entering
959                  * TIME-WAIT, and check against it before
960                  * accepting a new connection request.
961                  *
962                  * If "isn" is not zero, this request hit a live
963                  * timewait bucket, so all the necessary checks
964                  * were made in the function processing timewait state.
965                  */
966                 if (tmp_opt.saw_tstamp &&
967                     tcp_death_row.sysctl_tw_recycle &&
968                     (dst = inet_csk_route_req(sk, req)) != NULL &&
969                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
970                     peer->v4daddr == saddr) {
971                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
972                             (s32)(peer->tcp_ts - req->ts_recent) >
973                                                         TCP_PAWS_WINDOW) {
974                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
975                                 dst_release(dst);
976                                 goto drop_and_free;
977                         }
978                 }
979                 /* Kill the following clause, if you dislike this way. */
980                 else if (!sysctl_tcp_syncookies &&
981                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
982                           (sysctl_max_syn_backlog >> 2)) &&
983                          (!peer || !peer->tcp_ts_stamp) &&
984                          (!dst || !dst_metric(dst, RTAX_RTT))) {
985                         /* Without syncookies, the last quarter of the
986                          * backlog is reserved for destinations
987                          * proven to be alive.
988                          * It means that we continue to communicate
989                          * with destinations already remembered
990                          * at the moment of the synflood.
991                          */
992                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
993                                        "request from %u.%u.%u.%u/%u\n",
994                                        NIPQUAD(saddr),
995                                        ntohs(skb->h.th->source));
996                         dst_release(dst);
997                         goto drop_and_free;
998                 }
999
1000                 isn = tcp_v4_init_sequence(sk, skb);
1001         }
1002         tcp_rsk(req)->snt_isn = isn;
1003
1004         if (tcp_v4_send_synack(sk, req, dst))
1005                 goto drop_and_free;
1006
1007         if (want_cookie) {
1008                 reqsk_free(req);
1009         } else {
1010                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1011         }
1012         return 0;
1013
1014 drop_and_free:
1015         reqsk_free(req);
1016 drop:
1017         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1018         return 0;
1019 }
1020
1021
1022 /*
1023  * The three-way handshake has completed - we got a valid ACK -
1024  * now create the new socket.
1025  */
1026 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1027                                   struct request_sock *req,
1028                                   struct dst_entry *dst)
1029 {
1030         struct inet_request_sock *ireq;
1031         struct inet_sock *newinet;
1032         struct tcp_sock *newtp;
1033         struct sock *newsk;
1034
1035         if (sk_acceptq_is_full(sk))
1036                 goto exit_overflow;
1037
1038         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1039                 goto exit;
1040
1041         newsk = tcp_create_openreq_child(sk, req, skb);
1042         if (!newsk)
1043                 goto exit;
1044
1045         sk_setup_caps(newsk, dst);
1046
1047         newtp                 = tcp_sk(newsk);
1048         newinet               = inet_sk(newsk);
1049         ireq                  = inet_rsk(req);
1050         newinet->daddr        = ireq->rmt_addr;
1051         newinet->rcv_saddr    = ireq->loc_addr;
1052         newinet->saddr        = ireq->loc_addr;
1053         newinet->opt          = ireq->opt;
1054         ireq->opt             = NULL;
1055         newinet->mc_index     = inet_iif(skb);
1056         newinet->mc_ttl       = skb->nh.iph->ttl;
1057         newtp->ext_header_len = 0;
1058         if (newinet->opt)
1059                 newtp->ext_header_len = newinet->opt->optlen;
1060         newinet->id = newtp->write_seq ^ jiffies;
1061
1062         tcp_sync_mss(newsk, dst_mtu(dst));
1063         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1064         tcp_initialize_rcv_mss(newsk);
1065
1066         __inet_hash(&tcp_hashinfo, newsk, 0);
1067         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1068
1069         return newsk;
1070
1071 exit_overflow:
1072         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1073 exit:
1074         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1075         dst_release(dst);
1076         return NULL;
1077 }
1078
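/*
 * A segment arrived on a listening socket.  First look for a pending
 * request_sock (a connection still in SYN-RECV) and let tcp_check_req()
 * handle it; next look for an already established child that was hashed in
 * the meantime; finally, for a bare ACK, try to validate it as a SYN cookie.
 */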
1079 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1080 {
1081         struct tcphdr *th = skb->h.th;
1082         struct iphdr *iph = skb->nh.iph;
1083         struct sock *nsk;
1084         struct request_sock **prev;
1085         /* Find possible connection requests. */
1086         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1087                                                        iph->saddr, iph->daddr);
1088         if (req)
1089                 return tcp_check_req(sk, skb, req, prev);
1090
1091         nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1092                                         th->source, skb->nh.iph->daddr,
1093                                         ntohs(th->dest), inet_iif(skb));
1094
1095         if (nsk) {
1096                 if (nsk->sk_state != TCP_TIME_WAIT) {
1097                         bh_lock_sock(nsk);
1098                         return nsk;
1099                 }
1100                 inet_twsk_put((struct inet_timewait_sock *)nsk);
1101                 return NULL;
1102         }
1103
1104 #ifdef CONFIG_SYN_COOKIES
1105         if (!th->rst && !th->syn && th->ack)
1106                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1107 #endif
1108         return sk;
1109 }
1110
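/*
 * Receive-side checksum setup.  If the hardware already summed the payload we
 * only add the pseudo-header and verify.  Otherwise short packets (<= 76
 * bytes) are verified right away, while for longer ones we just store the
 * pseudo-header sum in skb->csum and let tcp_checksum_complete() finish the
 * verification later, once we know the packet will really be used.
 */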
1111 static int tcp_v4_checksum_init(struct sk_buff *skb)
1112 {
1113         if (skb->ip_summed == CHECKSUM_HW) {
1114                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1115                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1116                                   skb->nh.iph->daddr, skb->csum))
1117                         return 0;
1118
1119                 LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
1120                 skb->ip_summed = CHECKSUM_NONE;
1121         }
1122         if (skb->len <= 76) {
1123                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1124                                  skb->nh.iph->daddr,
1125                                  skb_checksum(skb, 0, skb->len, 0)))
1126                         return -1;
1127                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1128         } else {
1129                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1130                                           skb->nh.iph->saddr,
1131                                           skb->nh.iph->daddr, 0);
1132         }
1133         return 0;
1134 }
1135
1136
1137 /* The socket must have its spinlock held when we get
1138  * here.
1139  *
1140  * We have a potential double-lock case here, so even when
1141  * doing backlog processing we use the BH locking scheme.
1142  * This is because we cannot sleep with the original spinlock
1143  * held.
1144  */
1145 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1146 {
1147         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1148                 TCP_CHECK_TIMER(sk);
1149                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1150                         goto reset;
1151                 TCP_CHECK_TIMER(sk);
1152                 return 0;
1153         }
1154
1155         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1156                 goto csum_err;
1157
1158         if (sk->sk_state == TCP_LISTEN) {
1159                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1160                 if (!nsk)
1161                         goto discard;
1162
1163                 if (nsk != sk) {
1164                         if (tcp_child_process(sk, nsk, skb))
1165                                 goto reset;
1166                         return 0;
1167                 }
1168         }
1169
1170         TCP_CHECK_TIMER(sk);
1171         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1172                 goto reset;
1173         TCP_CHECK_TIMER(sk);
1174         return 0;
1175
1176 reset:
1177         tcp_v4_send_reset(skb);
1178 discard:
1179         kfree_skb(skb);
1180         /* Be careful here. If this function gets more complicated and
1181          * gcc suffers from register pressure on the x86, sk (in %ebx)
1182          * might be destroyed here. This current version compiles correctly,
1183          * but you have been warned.
1184          */
1185         return 0;
1186
1187 csum_err:
1188         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1189         goto discard;
1190 }
1191
1192 /*
1193  *      From tcp_input.c
1194  */
1195
1196 int tcp_v4_rcv(struct sk_buff *skb)
1197 {
1198         struct tcphdr *th;
1199         struct sock *sk;
1200         int ret;
1201
1202         if (skb->pkt_type != PACKET_HOST)
1203                 goto discard_it;
1204
1205         /* Count it even if it's bad */
1206         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1207
1208         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1209                 goto discard_it;
1210
1211         th = skb->h.th;
1212
1213         if (th->doff < sizeof(struct tcphdr) / 4)
1214                 goto bad_packet;
1215         if (!pskb_may_pull(skb, th->doff * 4))
1216                 goto discard_it;
1217
1218         /* An explanation is required here, I think.
1219          * Packet length and doff are validated by header prediction,
1220          * provided the case of th->doff==0 is eliminated.
1221          * So, we defer the checks. */
1222         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1223              tcp_v4_checksum_init(skb) < 0))
1224                 goto bad_packet;
1225
1226         th = skb->h.th;
1227         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1228         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1229                                     skb->len - th->doff * 4);
1230         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1231         TCP_SKB_CB(skb)->when    = 0;
1232         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1233         TCP_SKB_CB(skb)->sacked  = 0;
1234
1235         sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1236                            skb->nh.iph->daddr, ntohs(th->dest),
1237                            inet_iif(skb));
1238
1239         if (!sk)
1240                 goto no_tcp_socket;
1241
1242 process:
1243         if (sk->sk_state == TCP_TIME_WAIT)
1244                 goto do_time_wait;
1245
1246         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1247                 goto discard_and_relse;
1248
1249         if (sk_filter(sk, skb, 0))
1250                 goto discard_and_relse;
1251
1252         skb->dev = NULL;
1253
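        /* If no user process currently owns the socket, try to hand the skb
         * to that process via the prequeue (tcp_prequeue() declines when
         * tcp_low_latency is set or nobody is waiting in recvmsg); if it is
         * not taken, process it right here.  If the socket is owned, queue
         * the skb on the backlog, which the owner drains in release_sock().
         */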
1254         bh_lock_sock(sk);
1255         ret = 0;
1256         if (!sock_owned_by_user(sk)) {
1257                 if (!tcp_prequeue(sk, skb))
1258                         ret = tcp_v4_do_rcv(sk, skb);
1259         } else
1260                 sk_add_backlog(sk, skb);
1261         bh_unlock_sock(sk);
1262
1263         sock_put(sk);
1264
1265         return ret;
1266
1267 no_tcp_socket:
1268         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1269                 goto discard_it;
1270
1271         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1272 bad_packet:
1273                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1274         } else {
1275                 tcp_v4_send_reset(skb);
1276         }
1277
1278 discard_it:
1279         /* Discard frame. */
1280         kfree_skb(skb);
1281         return 0;
1282
1283 discard_and_relse:
1284         sock_put(sk);
1285         goto discard_it;
1286
1287 do_time_wait:
1288         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1289                 inet_twsk_put((struct inet_timewait_sock *) sk);
1290                 goto discard_it;
1291         }
1292
1293         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1294                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1295                 inet_twsk_put((struct inet_timewait_sock *) sk);
1296                 goto discard_it;
1297         }
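        /* tcp_timewait_state_process() tells us what to do with a segment that
         * hit a TIME-WAIT bucket: TCP_TW_SYN means an acceptable new SYN is
         * reusing the port pair (look up a listener and handle it as a fresh
         * connection), TCP_TW_ACK asks for a duplicate ACK, TCP_TW_RST asks
         * for a reset, and TCP_TW_SUCCESS means there is nothing more to do.
         */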
1298         switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1299                                            skb, th)) {
1300         case TCP_TW_SYN: {
1301                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1302                                                         skb->nh.iph->daddr,
1303                                                         ntohs(th->dest),
1304                                                         inet_iif(skb));
1305                 if (sk2) {
1306                         inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1307                                              &tcp_death_row);
1308                         inet_twsk_put((struct inet_timewait_sock *)sk);
1309                         sk = sk2;
1310                         goto process;
1311                 }
1312                 /* Fall through to ACK */
1313         }
1314         case TCP_TW_ACK:
1315                 tcp_v4_timewait_ack(sk, skb);
1316                 break;
1317         case TCP_TW_RST:
1318                 goto no_tcp_socket;
1319         case TCP_TW_SUCCESS:;
1320         }
1321         goto discard_it;
1322 }
1323
1324 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1325 {
1326         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1327         struct inet_sock *inet = inet_sk(sk);
1328
1329         sin->sin_family         = AF_INET;
1330         sin->sin_addr.s_addr    = inet->daddr;
1331         sin->sin_port           = inet->dport;
1332 }
1333
1334 /* VJ's idea. Save the last timestamp seen from this destination
1335  * and hold it for at least the normal timewait interval, to use for duplicate
1336  * segment detection in subsequent connections before they enter the synchronized
1337  * state.
1338  */
1339
1340 int tcp_v4_remember_stamp(struct sock *sk)
1341 {
1342         struct inet_sock *inet = inet_sk(sk);
1343         struct tcp_sock *tp = tcp_sk(sk);
1344         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1345         struct inet_peer *peer = NULL;
1346         int release_it = 0;
1347
1348         if (!rt || rt->rt_dst != inet->daddr) {
1349                 peer = inet_getpeer(inet->daddr, 1);
1350                 release_it = 1;
1351         } else {
1352                 if (!rt->peer)
1353                         rt_bind_peer(rt, 1);
1354                 peer = rt->peer;
1355         }
1356
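        /* Only ever move the cached timestamp forward: the signed 32-bit
         * difference copes with timestamp wrap-around, and an entry whose
         * stamp is older than TCP_PAWS_MSL may also be refreshed outright.
         */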
1357         if (peer) {
1358                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1359                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1360                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1361                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1362                         peer->tcp_ts = tp->rx_opt.ts_recent;
1363                 }
1364                 if (release_it)
1365                         inet_putpeer(peer);
1366                 return 1;
1367         }
1368
1369         return 0;
1370 }
1371
1372 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1373 {
1374         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1375
1376         if (peer) {
1377                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1378
1379                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1380                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1381                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1382                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1383                         peer->tcp_ts       = tcptw->tw_ts_recent;
1384                 }
1385                 inet_putpeer(peer);
1386                 return 1;
1387         }
1388
1389         return 0;
1390 }
1391
1392 struct tcp_func ipv4_specific = {
1393         .queue_xmit     =       ip_queue_xmit,
1394         .send_check     =       tcp_v4_send_check,
1395         .rebuild_header =       inet_sk_rebuild_header,
1396         .conn_request   =       tcp_v4_conn_request,
1397         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1398         .remember_stamp =       tcp_v4_remember_stamp,
1399         .net_header_len =       sizeof(struct iphdr),
1400         .setsockopt     =       ip_setsockopt,
1401         .getsockopt     =       ip_getsockopt,
1402         .addr2sockaddr  =       v4_addr2sockaddr,
1403         .sockaddr_len   =       sizeof(struct sockaddr_in),
1404 };
1405
1406 /* NOTE: A lot of things are set to zero explicitly by the call to
1407  *       sk_alloc(), so they need not be done here.
1408  */
1409 static int tcp_v4_init_sock(struct sock *sk)
1410 {
1411         struct inet_connection_sock *icsk = inet_csk(sk);
1412         struct tcp_sock *tp = tcp_sk(sk);
1413
1414         skb_queue_head_init(&tp->out_of_order_queue);
1415         tcp_init_xmit_timers(sk);
1416         tcp_prequeue_init(tp);
1417
1418         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1419         tp->mdev = TCP_TIMEOUT_INIT;
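        /* Until real RTT samples arrive, both the retransmission timeout and
         * the mean deviation estimate start from the conservative
         * TCP_TIMEOUT_INIT value (3 seconds in this era of the stack).
         */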
1420
1421         /* So many TCP implementations out there (incorrectly) count the
1422          * initial SYN frame in their delayed-ACK and congestion control
1423          * algorithms that we must have the following bandaid to talk
1424          * efficiently to them.  -DaveM
1425          */
1426         tp->snd_cwnd = 2;
1427
1428         /* See draft-stevens-tcpca-spec-01 for discussion of the
1429          * initialization of these values.
1430          */
1431         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1432         tp->snd_cwnd_clamp = ~0;
1433         tp->mss_cache = 536;
1434
1435         tp->reordering = sysctl_tcp_reordering;
1436         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1437
1438         sk->sk_state = TCP_CLOSE;
1439
1440         sk->sk_write_space = sk_stream_write_space;
1441         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1442
1443         tp->af_specific = &ipv4_specific;
1444
1445         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1446         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1447
1448         atomic_inc(&tcp_sockets_allocated);
1449
1450         return 0;
1451 }
1452
1453 int tcp_v4_destroy_sock(struct sock *sk)
1454 {
1455         struct tcp_sock *tp = tcp_sk(sk);
1456
1457         tcp_clear_xmit_timers(sk);
1458
1459         tcp_cleanup_congestion_control(sk);
1460
1461         /* Clean up the write buffer. */
1462         sk_stream_writequeue_purge(sk);
1463
1464         /* Cleans up our, hopefully empty, out_of_order_queue. */
1465         __skb_queue_purge(&tp->out_of_order_queue);
1466
1467         /* Clean up the prequeue; it really must be empty. */
1468         __skb_queue_purge(&tp->ucopy.prequeue);
1469
1470         /* Clean up a referenced TCP bind bucket. */
1471         if (inet_csk(sk)->icsk_bind_hash)
1472                 inet_put_port(&tcp_hashinfo, sk);
1473
1474         /*
1475          * If a cached sendmsg page exists, toss it.
1476          */
1477         if (sk->sk_sndmsg_page) {
1478                 __free_page(sk->sk_sndmsg_page);
1479                 sk->sk_sndmsg_page = NULL;
1480         }
1481
1482         atomic_dec(&tcp_sockets_allocated);
1483
1484         return 0;
1485 }
1486
1487 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1488
1489 #ifdef CONFIG_PROC_FS
1490 /* Proc filesystem TCP sock list dumping. */
1491
1492 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1493 {
1494         return hlist_empty(head) ? NULL :
1495                 hlist_entry(head->first, struct inet_timewait_sock, tw_node);
1496 }
1497
1498 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1499 {
1500         return tw->tw_node.next ?
1501                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1502 }
1503
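/* Iterator over the listening hash.  The walk has two levels:
 * TCP_SEQ_STATE_LISTENING steps through the listening sockets
 * themselves, and TCP_SEQ_STATE_OPENREQ steps through each listener's
 * syn_table of embryonic request_socks.  st->bucket indexes the
 * listening hash, st->sbucket the current listener's syn_table.  The
 * listen lock is held by the caller (see tcp_get_idx()); the
 * per-listener syn_wait_lock is taken and dropped here as we enter and
 * leave the OPENREQ sub-walk.
 */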
1504 static void *listening_get_next(struct seq_file *seq, void *cur)
1505 {
1506         struct inet_connection_sock *icsk;
1507         struct hlist_node *node;
1508         struct sock *sk = cur;
1509         struct tcp_iter_state* st = seq->private;
1510
1511         if (!sk) {
1512                 st->bucket = 0;
1513                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1514                 goto get_sk;
1515         }
1516
1517         ++st->num;
1518
1519         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1520                 struct request_sock *req = cur;
1521
1522                 icsk = inet_csk(st->syn_wait_sk);
1523                 req = req->dl_next;
1524                 while (1) {
1525                         while (req) {
1526                                 if (req->rsk_ops->family == st->family) {
1527                                         cur = req;
1528                                         goto out;
1529                                 }
1530                                 req = req->dl_next;
1531                         }
1532                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
1533                                 break;
1534 get_req:
1535                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1536                 }
1537                 sk        = sk_next(st->syn_wait_sk);
1538                 st->state = TCP_SEQ_STATE_LISTENING;
1539                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1540         } else {
1541                 icsk = inet_csk(sk);
1542                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1543                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1544                         goto start_req;
1545                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1546                 sk = sk_next(sk);
1547         }
1548 get_sk:
1549         sk_for_each_from(sk, node) {
1550                 if (sk->sk_family == st->family) {
1551                         cur = sk;
1552                         goto out;
1553                 }
1554                 icsk = inet_csk(sk);
1555                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1556                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1557 start_req:
1558                         st->uid         = sock_i_uid(sk);
1559                         st->syn_wait_sk = sk;
1560                         st->state       = TCP_SEQ_STATE_OPENREQ;
1561                         st->sbucket     = 0;
1562                         goto get_req;
1563                 }
1564                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1565         }
1566         if (++st->bucket < INET_LHTABLE_SIZE) {
1567                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1568                 goto get_sk;
1569         }
1570         cur = NULL;
1571 out:
1572         return cur;
1573 }
1574
1575 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1576 {
1577         void *rc = listening_get_next(seq, NULL);
1578
1579         while (rc && *pos) {
1580                 rc = listening_get_next(seq, rc);
1581                 --*pos;
1582         }
1583         return rc;
1584 }
1585
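/* Iterator over the established hash.  Bucket N of ehash holds
 * established sockets, bucket N + ehash_size the corresponding
 * TIME_WAIT sockets, which is why the walk flips to
 * TCP_SEQ_STATE_TIME_WAIT before scanning the second half of each
 * chain pair.  Every chain is scanned under its bucket's read lock,
 * with softirqs disabled by the caller (see tcp_get_idx()), so we only
 * cond_resched_softirq() between buckets.
 */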
1586 static void *established_get_first(struct seq_file *seq)
1587 {
1588         struct tcp_iter_state* st = seq->private;
1589         void *rc = NULL;
1590
1591         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1592                 struct sock *sk;
1593                 struct hlist_node *node;
1594                 struct inet_timewait_sock *tw;
1595
1596                 /* We can reschedule _before_ having picked the target: */
1597                 cond_resched_softirq();
1598
1599                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1600                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1601                         if (sk->sk_family != st->family) {
1602                                 continue;
1603                         }
1604                         rc = sk;
1605                         goto out;
1606                 }
1607                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1608                 inet_twsk_for_each(tw, node,
1609                                    &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1610                         if (tw->tw_family != st->family) {
1611                                 continue;
1612                         }
1613                         rc = tw;
1614                         goto out;
1615                 }
1616                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1617                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1618         }
1619 out:
1620         return rc;
1621 }
1622
1623 static void *established_get_next(struct seq_file *seq, void *cur)
1624 {
1625         struct sock *sk = cur;
1626         struct inet_timewait_sock *tw;
1627         struct hlist_node *node;
1628         struct tcp_iter_state* st = seq->private;
1629
1630         ++st->num;
1631
1632         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1633                 tw = cur;
1634                 tw = tw_next(tw);
1635 get_tw:
1636                 while (tw && tw->tw_family != st->family) {
1637                         tw = tw_next(tw);
1638                 }
1639                 if (tw) {
1640                         cur = tw;
1641                         goto out;
1642                 }
1643                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1644                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1645
1646                 /* We can reschedule between buckets: */
1647                 cond_resched_softirq();
1648
1649                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1650                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1651                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1652                 } else {
1653                         cur = NULL;
1654                         goto out;
1655                 }
1656         } else
1657                 sk = sk_next(sk);
1658
1659         sk_for_each_from(sk, node) {
1660                 if (sk->sk_family == st->family)
1661                         goto found;
1662         }
1663
1664         st->state = TCP_SEQ_STATE_TIME_WAIT;
1665         tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1666         goto get_tw;
1667 found:
1668         cur = sk;
1669 out:
1670         return cur;
1671 }
1672
1673 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1674 {
1675         void *rc = established_get_first(seq);
1676
1677         while (rc && pos) {
1678                 rc = established_get_next(seq, rc);
1679                 --pos;
1680         }
1681         return rc;
1682 }
1683
1684 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1685 {
1686         void *rc;
1687         struct tcp_iter_state* st = seq->private;
1688
1689         inet_listen_lock(&tcp_hashinfo);
1690         st->state = TCP_SEQ_STATE_LISTENING;
1691         rc        = listening_get_idx(seq, &pos);
1692
1693         if (!rc) {
1694                 inet_listen_unlock(&tcp_hashinfo);
1695                 local_bh_disable();
1696                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1697                 rc        = established_get_idx(seq, pos);
1698         }
1699
1700         return rc;
1701 }
1702
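/* seq_file glue.  tcp_seq_start() returns SEQ_START_TOKEN first so the
 * ->show routine can emit the header line, then the iterators above
 * supply one socket (or request_sock, or timewait sock) per call to
 * tcp_seq_next().  tcp_seq_stop() releases whatever lock the iterator
 * happens to be holding, which depends on the state it stopped in.
 */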
1703 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1704 {
1705         struct tcp_iter_state* st = seq->private;
1706         st->state = TCP_SEQ_STATE_LISTENING;
1707         st->num = 0;
1708         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1709 }
1710
1711 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1712 {
1713         void *rc = NULL;
1714         struct tcp_iter_state* st;
1715
1716         if (v == SEQ_START_TOKEN) {
1717                 rc = tcp_get_idx(seq, 0);
1718                 goto out;
1719         }
1720         st = seq->private;
1721
1722         switch (st->state) {
1723         case TCP_SEQ_STATE_OPENREQ:
1724         case TCP_SEQ_STATE_LISTENING:
1725                 rc = listening_get_next(seq, v);
1726                 if (!rc) {
1727                         inet_listen_unlock(&tcp_hashinfo);
1728                         local_bh_disable();
1729                         st->state = TCP_SEQ_STATE_ESTABLISHED;
1730                         rc        = established_get_first(seq);
1731                 }
1732                 break;
1733         case TCP_SEQ_STATE_ESTABLISHED:
1734         case TCP_SEQ_STATE_TIME_WAIT:
1735                 rc = established_get_next(seq, v);
1736                 break;
1737         }
1738 out:
1739         ++*pos;
1740         return rc;
1741 }
1742
1743 static void tcp_seq_stop(struct seq_file *seq, void *v)
1744 {
1745         struct tcp_iter_state* st = seq->private;
1746
1747         switch (st->state) {
1748         case TCP_SEQ_STATE_OPENREQ:
1749                 if (v) {
1750                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1751                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1752                 }
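                /* fall through: the listen lock is dropped below */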
1753         case TCP_SEQ_STATE_LISTENING:
1754                 if (v != SEQ_START_TOKEN)
1755                         inet_listen_unlock(&tcp_hashinfo);
1756                 break;
1757         case TCP_SEQ_STATE_TIME_WAIT:
1758         case TCP_SEQ_STATE_ESTABLISHED:
1759                 if (v)
1760                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1761                 local_bh_enable();
1762                 break;
1763         }
1764 }
1765
1766 static int tcp_seq_open(struct inode *inode, struct file *file)
1767 {
1768         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1769         struct seq_file *seq;
1770         struct tcp_iter_state *s;
1771         int rc;
1772
1773         if (unlikely(afinfo == NULL))
1774                 return -EINVAL;
1775
1776         s = kmalloc(sizeof(*s), GFP_KERNEL);
1777         if (!s)
1778                 return -ENOMEM;
1779         memset(s, 0, sizeof(*s));
1780         s->family               = afinfo->family;
1781         s->seq_ops.start        = tcp_seq_start;
1782         s->seq_ops.next         = tcp_seq_next;
1783         s->seq_ops.show         = afinfo->seq_show;
1784         s->seq_ops.stop         = tcp_seq_stop;
1785
1786         rc = seq_open(file, &s->seq_ops);
1787         if (rc)
1788                 goto out_kfree;
1789         seq          = file->private_data;
1790         seq->private = s;
1791 out:
1792         return rc;
1793 out_kfree:
1794         kfree(s);
1795         goto out;
1796 }
1797
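/* Register a per-family /proc/net dumper.  The caller supplies a
 * tcp_seq_afinfo naming the file, the family and the ->show routine;
 * the file operations are filled in here.  A minimal sketch of a
 * caller, mirroring tcp4_seq_afinfo further down (the tcpX names are
 * purely illustrative):
 *
 *	static struct file_operations tcpX_seq_fops;
 *	static struct tcp_seq_afinfo tcpX_seq_afinfo = {
 *		.owner		= THIS_MODULE,
 *		.name		= "tcpX",
 *		.family		= AF_INET,
 *		.seq_show	= tcpX_seq_show,
 *		.seq_fops	= &tcpX_seq_fops,
 *	};
 *
 *	err = tcp_proc_register(&tcpX_seq_afinfo);
 */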
1798 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1799 {
1800         int rc = 0;
1801         struct proc_dir_entry *p;
1802
1803         if (!afinfo)
1804                 return -EINVAL;
1805         afinfo->seq_fops->owner         = afinfo->owner;
1806         afinfo->seq_fops->open          = tcp_seq_open;
1807         afinfo->seq_fops->read          = seq_read;
1808         afinfo->seq_fops->llseek        = seq_lseek;
1809         afinfo->seq_fops->release       = seq_release_private;
1810
1811         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1812         if (p)
1813                 p->data = afinfo;
1814         else
1815                 rc = -ENOMEM;
1816         return rc;
1817 }
1818
1819 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1820 {
1821         if (!afinfo)
1822                 return;
1823         proc_net_remove(afinfo->name);
1824         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1825 }
1826
1827 static void get_openreq4(struct sock *sk, struct request_sock *req,
1828                          char *tmpbuf, int i, int uid)
1829 {
1830         const struct inet_request_sock *ireq = inet_rsk(req);
1831         int ttd = req->expires - jiffies;
1832
1833         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1834                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1835                 i,
1836                 ireq->loc_addr,
1837                 ntohs(inet_sk(sk)->sport),
1838                 ireq->rmt_addr,
1839                 ntohs(ireq->rmt_port),
1840                 TCP_SYN_RECV,
1841                 0, 0, /* could print option size, but that is af dependent. */
1842                 1,    /* timers active (only the expire timer) */
1843                 jiffies_to_clock_t(ttd),
1844                 req->retrans,
1845                 uid,
1846                 0,  /* non standard timer */
1847                 0, /* open_requests have no inode */
1848                 atomic_read(&sk->sk_refcnt),
1849                 req);
1850 }
1851
1852 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1853 {
1854         int timer_active;
1855         unsigned long timer_expires;
1856         struct tcp_sock *tp = tcp_sk(sp);
1857         const struct inet_connection_sock *icsk = inet_csk(sp);
1858         struct inet_sock *inet = inet_sk(sp);
1859         unsigned int dest = inet->daddr;
1860         unsigned int src = inet->rcv_saddr;
1861         __u16 destp = ntohs(inet->dport);
1862         __u16 srcp = ntohs(inet->sport);
1863
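        /* Value reported in the "tr" column of /proc/net/tcp:
         * 1 = retransmit timer pending, 4 = zero window probe timer,
         * 2 = keepalive-style timer armed on sk_timer, 0 = nothing
         * pending.  TIME_WAIT entries report 3 (see get_timewait4_sock()).
         */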
1864         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1865                 timer_active    = 1;
1866                 timer_expires   = icsk->icsk_timeout;
1867         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1868                 timer_active    = 4;
1869                 timer_expires   = icsk->icsk_timeout;
1870         } else if (timer_pending(&sp->sk_timer)) {
1871                 timer_active    = 2;
1872                 timer_expires   = sp->sk_timer.expires;
1873         } else {
1874                 timer_active    = 0;
1875                 timer_expires = jiffies;
1876         }
1877
1878         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1879                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
1880                 i, src, srcp, dest, destp, sp->sk_state,
1881                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
1882                 timer_active,
1883                 jiffies_to_clock_t(timer_expires - jiffies),
1884                 icsk->icsk_retransmits,
1885                 sock_i_uid(sp),
1886                 icsk->icsk_probes_out,
1887                 sock_i_ino(sp),
1888                 atomic_read(&sp->sk_refcnt), sp,
1889                 icsk->icsk_rto,
1890                 icsk->icsk_ack.ato,
1891                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1892                 tp->snd_cwnd,
1893                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1894 }
1895
1896 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1897 {
1898         unsigned int dest, src;
1899         __u16 destp, srcp;
1900         int ttd = tw->tw_ttd - jiffies;
1901
1902         if (ttd < 0)
1903                 ttd = 0;
1904
1905         dest  = tw->tw_daddr;
1906         src   = tw->tw_rcv_saddr;
1907         destp = ntohs(tw->tw_dport);
1908         srcp  = ntohs(tw->tw_sport);
1909
1910         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1911                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1912                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1913                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1914                 atomic_read(&tw->tw_refcnt), tw);
1915 }
1916
1917 #define TMPSZ 150
1918
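/* One line of /proc/net/tcp per socket, formatted by the helpers above
 * according to its state.  Purely illustrative example (made-up values,
 * spacing approximate) for a listener on 127.0.0.1:8080 on a
 * little-endian machine:
 *
 *   0: 0100007F:1F90 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000 0 12345 1 ffff81003d0d2500 200 40 0 2 -1
 *
 * Addresses and ports are hex, followed by state, queue sizes, timer
 * info, uid, timeout and inode as named in the header row emitted by
 * tcp4_seq_show() below.
 */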
1919 static int tcp4_seq_show(struct seq_file *seq, void *v)
1920 {
1921         struct tcp_iter_state* st;
1922         char tmpbuf[TMPSZ + 1];
1923
1924         if (v == SEQ_START_TOKEN) {
1925                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
1926                            "  sl  local_address rem_address   st tx_queue "
1927                            "rx_queue tr tm->when retrnsmt   uid  timeout "
1928                            "inode");
1929                 goto out;
1930         }
1931         st = seq->private;
1932
1933         switch (st->state) {
1934         case TCP_SEQ_STATE_LISTENING:
1935         case TCP_SEQ_STATE_ESTABLISHED:
1936                 get_tcp4_sock(v, tmpbuf, st->num);
1937                 break;
1938         case TCP_SEQ_STATE_OPENREQ:
1939                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1940                 break;
1941         case TCP_SEQ_STATE_TIME_WAIT:
1942                 get_timewait4_sock(v, tmpbuf, st->num);
1943                 break;
1944         }
1945         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1946 out:
1947         return 0;
1948 }
1949
1950 static struct file_operations tcp4_seq_fops;
1951 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1952         .owner          = THIS_MODULE,
1953         .name           = "tcp",
1954         .family         = AF_INET,
1955         .seq_show       = tcp4_seq_show,
1956         .seq_fops       = &tcp4_seq_fops,
1957 };
1958
1959 int __init tcp4_proc_init(void)
1960 {
1961         return tcp_proc_register(&tcp4_seq_afinfo);
1962 }
1963
1964 void tcp4_proc_exit(void)
1965 {
1966         tcp_proc_unregister(&tcp4_seq_afinfo);
1967 }
1968 #endif /* CONFIG_PROC_FS */
1969
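/* The protocol descriptor handed to the socket layer; the IPv4 startup
 * code registers it and routes SOCK_STREAM/IPPROTO_TCP sockets to these
 * methods.
 */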
1970 struct proto tcp_prot = {
1971         .name                   = "TCP",
1972         .owner                  = THIS_MODULE,
1973         .close                  = tcp_close,
1974         .connect                = tcp_v4_connect,
1975         .disconnect             = tcp_disconnect,
1976         .accept                 = inet_csk_accept,
1977         .ioctl                  = tcp_ioctl,
1978         .init                   = tcp_v4_init_sock,
1979         .destroy                = tcp_v4_destroy_sock,
1980         .shutdown               = tcp_shutdown,
1981         .setsockopt             = tcp_setsockopt,
1982         .getsockopt             = tcp_getsockopt,
1983         .sendmsg                = tcp_sendmsg,
1984         .recvmsg                = tcp_recvmsg,
1985         .backlog_rcv            = tcp_v4_do_rcv,
1986         .hash                   = tcp_v4_hash,
1987         .unhash                 = tcp_unhash,
1988         .get_port               = tcp_v4_get_port,
1989         .enter_memory_pressure  = tcp_enter_memory_pressure,
1990         .sockets_allocated      = &tcp_sockets_allocated,
1991         .orphan_count           = &tcp_orphan_count,
1992         .memory_allocated       = &tcp_memory_allocated,
1993         .memory_pressure        = &tcp_memory_pressure,
1994         .sysctl_mem             = sysctl_tcp_mem,
1995         .sysctl_wmem            = sysctl_tcp_wmem,
1996         .sysctl_rmem            = sysctl_tcp_rmem,
1997         .max_header             = MAX_TCP_HEADER,
1998         .obj_size               = sizeof(struct tcp_sock),
1999         .twsk_obj_size          = sizeof(struct tcp_timewait_sock),
2000         .rsk_prot               = &tcp_request_sock_ops,
2001 };
2002
2003
2004
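/* Called once during IPv4 initialization: create the kernel-internal
 * control socket this file uses to emit stand-alone segments, e.g. RSTs
 * for packets that match no socket.  The GFP_ATOMIC allocation mode lets
 * those replies be built from softirq context.
 */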
2005 void __init tcp_v4_init(struct net_proto_family *ops)
2006 {
2007         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2008         if (err < 0)
2009                 panic("Failed to create the TCP control socket.\n");
2010         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2011         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2012
2013         /* Unhash it so that IP input processing does not even
2014          * see it; we do not wish this socket to receive any
2015          * incoming packets.
2016          */
2017         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2018 }
2019
2020 EXPORT_SYMBOL(ipv4_specific);
2021 EXPORT_SYMBOL(inet_bind_bucket_create);
2022 EXPORT_SYMBOL(tcp_hashinfo);
2023 EXPORT_SYMBOL(tcp_prot);
2024 EXPORT_SYMBOL(tcp_unhash);
2025 EXPORT_SYMBOL(tcp_v4_conn_request);
2026 EXPORT_SYMBOL(tcp_v4_connect);
2027 EXPORT_SYMBOL(tcp_v4_do_rcv);
2028 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2029 EXPORT_SYMBOL(tcp_v4_send_check);
2030 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2031
2032 #ifdef CONFIG_PROC_FS
2033 EXPORT_SYMBOL(tcp_proc_register);
2034 EXPORT_SYMBOL(tcp_proc_unregister);
2035 #endif
2036 EXPORT_SYMBOL(sysctl_local_port_range);
2037 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2038 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2039