Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wirel...
[linux-2.6-block.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
78
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87
/* Read by tcp_twsk_unique(): when that function is called with a non-NULL
 * twp, a TIME-WAIT bucket is only handed over if this is non-zero (and the
 * bucket's last timestamp is more than one second old).
 */
88 int sysctl_tcp_tw_reuse __read_mostly;
/* Exported for use outside this file; nothing in this part of the file
 * reads it.
 */
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91
92
/* Forward declaration, only present when TCP MD5 signatures are configured.
 * tcp_v4_send_reset() and tcp_v4_send_ack() below call it to write the
 * signature of a reply header into the reply's option words.  The
 * definition is not in this part of the file.
 */
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
96 #endif
97
/* Socket lookup state for TCP.  In this file it is handed to inet_lookup()
 * (tcp_v4_err) and __inet_lookup_listener() (tcp_v4_send_reset).
 */
98 struct inet_hashinfo tcp_hashinfo;
99 EXPORT_SYMBOL(tcp_hashinfo);
100
101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102 {
103         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104                                           ip_hdr(skb)->saddr,
105                                           tcp_hdr(skb)->dest,
106                                           tcp_hdr(skb)->source);
107 }
108
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112         struct tcp_sock *tp = tcp_sk(sk);
113
114         /* With PAWS, it is safe from the viewpoint
115            of data integrity. Even without PAWS it is safe provided sequence
116            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
117
118            Actually, the idea is close to VJ's one, only timestamp cache is
119            held not per host, but per port pair and TW bucket is used as state
120            holder.
121
122            If TW bucket has been already destroyed we fall back to VJ's scheme
123            and use initial timestamp retrieved from peer table.
124          */
125         if (tcptw->tw_ts_recent_stamp &&
126             (twp == NULL || (sysctl_tcp_tw_reuse &&
127                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129                 if (tp->write_seq == 0)
130                         tp->write_seq = 1;
131                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
132                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133                 sock_hold(sktw);
134                 return 1;
135         }
136
137         return 0;
138 }
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
140
141 /* This will initiate an outgoing connection. */
/*
 * tcp_v4_connect - start an active open to the IPv4 address in @uaddr.
 * @sk:       socket to connect
 * @uaddr:    destination; read as a struct sockaddr_in
 * @addr_len: length of @uaddr as supplied by the caller
 *
 * Resolves a route, fills in the socket's source/destination identity,
 * moves the socket to TCP_SYN_SENT, hashes it with inet_hash_connect(),
 * chooses an initial write_seq when none is set (and the socket is not in
 * repair mode), then calls tcp_connect().
 *
 * Returns 0 on success.  Returns -EINVAL when @addr_len is too short or a
 * source-routed socket is given a zero destination, -EAFNOSUPPORT when the
 * family is not AF_INET, -ENETUNREACH for a multicast/broadcast route, or
 * the error of whichever routing, hashing or tcp_connect() step failed.
 * Failures after the state change go through "failure", which puts the
 * socket back to TCP_CLOSE and clears sk_route_caps and inet_dport.
 */
142 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143 {
144         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
145         struct inet_sock *inet = inet_sk(sk);
146         struct tcp_sock *tp = tcp_sk(sk);
147         __be16 orig_sport, orig_dport;
148         __be32 daddr, nexthop;
149         struct flowi4 *fl4;
150         struct rtable *rt;
151         int err;
152         struct ip_options_rcu *inet_opt;
153
154         if (addr_len < sizeof(struct sockaddr_in))
155                 return -EINVAL;
156
157         if (usin->sin_family != AF_INET)
158                 return -EAFNOSUPPORT;
159
160         nexthop = daddr = usin->sin_addr.s_addr;
161         inet_opt = rcu_dereference_protected(inet->inet_opt,
162                                              sock_owned_by_user(sk));
            /* With a source-route option the route is looked up towards the
             * option's first hop instead of the final destination.
             */
163         if (inet_opt && inet_opt->opt.srr) {
164                 if (!daddr)
165                         return -EINVAL;
166                 nexthop = inet_opt->opt.faddr;
167         }
168
            /* Remember the ports used for this lookup; they are passed to
             * ip_route_newports() below once the final ports are known.
             */
169         orig_sport = inet->inet_sport;
170         orig_dport = usin->sin_port;
171         fl4 = &inet->cork.fl.u.ip4;
172         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
173                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174                               IPPROTO_TCP,
175                               orig_sport, orig_dport, sk, true);
176         if (IS_ERR(rt)) {
177                 err = PTR_ERR(rt);
178                 if (err == -ENETUNREACH)
179                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
180                 return err;
181         }
182
183         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
184                 ip_rt_put(rt);
185                 return -ENETUNREACH;
186         }
187
            /* Without source routing, take the destination from the flow
             * filled in by the route lookup.
             */
188         if (!inet_opt || !inet_opt->opt.srr)
189                 daddr = fl4->daddr;
190
            /* An unbound socket takes the source address from the flow. */
191         if (!inet->inet_saddr)
192                 inet->inet_saddr = fl4->saddr;
193         inet->inet_rcv_saddr = inet->inet_saddr;
194
            /* Timestamp state left on the socket belongs to the previous
             * destination; drop it if the destination changed.
             */
195         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
196                 /* Reset inherited state */
197                 tp->rx_opt.ts_recent       = 0;
198                 tp->rx_opt.ts_recent_stamp = 0;
199                 if (likely(!tp->repair))
200                         tp->write_seq      = 0;
201         }
202
203         if (tcp_death_row.sysctl_tw_recycle &&
204             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
205                 tcp_fetch_timewait_stamp(sk, &rt->dst);
206
207         inet->inet_dport = usin->sin_port;
208         inet->inet_daddr = daddr;
209
210         inet_csk(sk)->icsk_ext_hdr_len = 0;
211         if (inet_opt)
212                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
213
214         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
215
216         /* Socket identity is still unknown (sport may be zero).
217          * However we set state to SYN-SENT and not releasing socket
218          * lock select source port, enter ourselves into the hash tables and
219          * complete initialization after this.
220          */
221         tcp_set_state(sk, TCP_SYN_SENT);
222         err = inet_hash_connect(&tcp_death_row, sk);
223         if (err)
224                 goto failure;
225
226         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227                                inet->inet_sport, inet->inet_dport, sk);
228         if (IS_ERR(rt)) {
229                 err = PTR_ERR(rt);
                    /* rt holds an error value here; clear it so that "failure"
                     * does not pass it to ip_rt_put().
                     */
230                 rt = NULL;
231                 goto failure;
232         }
233         /* OK, now commit destination to socket.  */
234         sk->sk_gso_type = SKB_GSO_TCPV4;
235         sk_setup_caps(sk, &rt->dst);
236
            /* Keep a write_seq that is already non-zero (e.g. one set by
             * tcp_twsk_unique()), and never pick one in repair mode.
             */
237         if (!tp->write_seq && likely(!tp->repair))
238                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239                                                            inet->inet_daddr,
240                                                            inet->inet_sport,
241                                                            usin->sin_port);
242
243         inet->inet_id = tp->write_seq ^ jiffies;
244
245         err = tcp_connect(sk);
246
            /* From here on "failure" must not put the route.
             * NOTE(review): presumably sk_setup_caps() above attached the
             * route to the socket - confirm.
             */
247         rt = NULL;
248         if (err)
249                 goto failure;
250
251         return 0;
252
253 failure:
254         /*
255          * This unhashes the socket and releases the local port,
256          * if necessary.
257          */
258         tcp_set_state(sk, TCP_CLOSE);
259         ip_rt_put(rt);
260         sk->sk_route_caps = 0;
261         inet->inet_dport = 0;
262         return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 static void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273         struct dst_entry *dst;
274         struct inet_sock *inet = inet_sk(sk);
275         u32 mtu = tcp_sk(sk)->mtu_info;
276
277         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
278          * send out by Linux are always <576bytes so they should go through
279          * unfragmented).
280          */
281         if (sk->sk_state == TCP_LISTEN)
282                 return;
283
284         dst = inet_csk_update_pmtu(sk, mtu);
285         if (!dst)
286                 return;
287
288         /* Something is about to be wrong... Remember soft error
289          * for the case, if this connection will not able to recover.
290          */
291         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
292                 sk->sk_err_soft = EMSGSIZE;
293
294         mtu = dst_mtu(dst);
295
296         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
297             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
298                 tcp_sync_mss(sk, mtu);
299
300                 /* Resend the TCP packet because it's
301                  * clear that the old packet has been
302                  * dropped. This is the new "fast" path mtu
303                  * discovery.
304                  */
305                 tcp_simple_retransmit(sk);
306         } /* else let the usual retransmit timer handle it */
307 }
308
309 static void do_redirect(struct sk_buff *skb, struct sock *sk)
310 {
311         struct dst_entry *dst = __sk_dst_check(sk, 0);
312
313         if (dst)
314                 dst->ops->redirect(dst, sk, skb);
315 }
316
317 /*
318  * This routine is called by the ICMP module when it gets some
319  * sort of error condition.  If err < 0 then the socket should
320  * be closed and the error returned to the user.  If err > 0
321  * it's just the icmp type << 8 | icmp code.  After adjustment
322  * header points to the first 8 bytes of the tcp header.  We need
323  * to find the appropriate port.
324  *
325  * The locking strategy used here is very "optimistic". When
326  * someone else accesses the socket the ICMP is just dropped
327  * and for some paths there is no check at all.
328  * A more general error queue to queue errors for later handling
329  * is probably better.
330  *
331  */
332
/*
 * tcp_v4_err - act on an ICMP error that quotes one of our TCP segments.
 * @icmp_skb: the ICMP packet; ->data is read as the quoted IP header, with
 *            the quoted TCP header iph->ihl 32-bit words further on
 * @info:     stored in tp->mtu_info for ICMP_FRAG_NEEDED; not used by any
 *            other branch
 *
 * The socket is looked up from the quoted addresses and ports.  Once a
 * non-TIME_WAIT socket has been found and bh-locked, every exit goes
 * through "out", which calls bh_unlock_sock() and sock_put().
 */
333 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
334 {
335         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
336         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
337         struct inet_connection_sock *icsk;
338         struct tcp_sock *tp;
339         struct inet_sock *inet;
340         const int type = icmp_hdr(icmp_skb)->type;
341         const int code = icmp_hdr(icmp_skb)->code;
342         struct sock *sk;
343         struct sk_buff *skb;
344         struct request_sock *req;
345         __u32 seq;
346         __u32 remaining;
347         int err;
348         struct net *net = dev_net(icmp_skb->dev);
349
            /* Need the quoted IP header plus 8 bytes of TCP header, which
             * covers the ports and the sequence number read below.
             */
350         if (icmp_skb->len < (iph->ihl << 2) + 8) {
351                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
352                 return;
353         }
354
            /* The quoted segment was sent by us, so its destination is the
             * peer and its source is the local end.
             */
355         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
356                         iph->saddr, th->source, inet_iif(icmp_skb));
357         if (!sk) {
358                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
359                 return;
360         }
361         if (sk->sk_state == TCP_TIME_WAIT) {
362                 inet_twsk_put(inet_twsk(sk));
363                 return;
364         }
365
366         bh_lock_sock(sk);
367         /* If too many ICMPs get dropped on busy
368          * servers this needs to be solved differently.
369          * We do take care of PMTU discovery (RFC1191) special case :
370          * we can receive locally generated ICMP messages while socket is held.
371          */
            /* Only the counter is bumped here; processing continues and the
             * branches below test sock_owned_by_user() themselves.
             */
372         if (sock_owned_by_user(sk)) {
373                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
374                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
375         }
376         if (sk->sk_state == TCP_CLOSE)
377                 goto out;
378
379         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
380                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
381                 goto out;
382         }
383
384         icsk = inet_csk(sk);
385         tp = tcp_sk(sk);
386         req = tp->fastopen_rsk;
387         seq = ntohl(th->seq);
            /* Outside TCP_LISTEN the quoted sequence number must lie between
             * snd_una and snd_nxt, or equal the Fast Open request's snt_isn.
             */
388         if (sk->sk_state != TCP_LISTEN &&
389             !between(seq, tp->snd_una, tp->snd_nxt) &&
390             (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
391                 /* For a Fast Open socket, allow seq to be snt_isn. */
392                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
393                 goto out;
394         }
395
            /* Either handle the message completely (goto out) or translate
             * it into an errno in err and break to the per-state handling.
             */
396         switch (type) {
397         case ICMP_REDIRECT:
398                 do_redirect(icmp_skb, sk);
399                 goto out;
400         case ICMP_SOURCE_QUENCH:
401                 /* Just silently ignore these. */
402                 goto out;
403         case ICMP_PARAMETERPROB:
404                 err = EPROTO;
405                 break;
406         case ICMP_DEST_UNREACH:
                    /* code indexes icmp_err_convert[] below. */
407                 if (code > NR_ICMP_UNREACH)
408                         goto out;
409
410                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
411                         tp->mtu_info = info;
412                         if (!sock_owned_by_user(sk)) {
413                                 tcp_v4_mtu_reduced(sk);
414                         } else {
                                    /* Defer: set the flag and, only if it was
                                     * not already set, take a socket
                                     * reference for the deferred work.
                                     */
415                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
416                                         sock_hold(sk);
417                         }
418                         goto out;
419                 }
420
421                 err = icmp_err_convert[code].errno;
422                 /* check if icmp_skb allows revert of backoff
423                  * (see draft-zimmermann-tcp-lcd) */
424                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
425                         break;
426                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
427                     !icsk->icsk_backoff)
428                         break;
429
430                 /* XXX (TFO) - revisit the following logic for TFO */
431
432                 if (sock_owned_by_user(sk))
433                         break;
434
                    /* Undo one backoff step and recompute the RTO with the
                     * reduced backoff shift.
                     */
435                 icsk->icsk_backoff--;
436                 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
437                         TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
438                 tcp_bound_rto(sk);
439
440                 skb = tcp_write_queue_head(sk);
                    /* NOTE(review): relies on the write queue being non-empty
                     * whenever icsk_retransmits is non-zero - confirm.
                     */
441                 BUG_ON(!skb);
442
                    /* Time left on the new RTO, measured from the head skb's
                     * ->when stamp; the min() keeps it from going below 0.
                     */
443                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
444                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
445
446                 if (remaining) {
447                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
448                                                   remaining, TCP_RTO_MAX);
449                 } else {
450                         /* RTO revert clocked out retransmission.
451                          * Will retransmit now */
452                         tcp_retransmit_timer(sk);
453                 }
454
455                 break;
456         case ICMP_TIME_EXCEEDED:
457                 err = EHOSTUNREACH;
458                 break;
459         default:
460                 goto out;
461         }
462
463         /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
464          * than following the TCP_SYN_RECV case and closing the socket,
465          * we ignore the ICMP error and keep trying like a fully established
466          * socket. Is this the right thing to do?
467          */
468         if (req && req->sk == NULL)
469                 goto out;
470
471         switch (sk->sk_state) {
                    /* This req shadows the function-level req (the Fast Open
                     * request) for the rest of the switch.
                     */
472                 struct request_sock *req, **prev;
473         case TCP_LISTEN:
474                 if (sock_owned_by_user(sk))
475                         goto out;
476
477                 req = inet_csk_search_req(sk, &prev, th->dest,
478                                           iph->daddr, iph->saddr);
479                 if (!req)
480                         goto out;
481
482                 /* ICMPs are not backlogged, hence we cannot get
483                    an established socket here.
484                  */
485                 WARN_ON(req->sk);
486
487                 if (seq != tcp_rsk(req)->snt_isn) {
488                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
489                         goto out;
490                 }
491
492                 /*
493                  * Still in SYN_RECV, just remove it silently.
494                  * There is no good way to pass the error to the newly
495                  * created socket, and POSIX does not want network
496                  * errors returned from accept().
497                  */
498                 inet_csk_reqsk_queue_drop(sk, req, prev);
499                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
500                 goto out;
501
502         case TCP_SYN_SENT:
503         case TCP_SYN_RECV:  /* Cannot happen.
504                                It can f.e. if SYNs crossed,
505                                or Fast Open.
506                              */
                    /* Connection not yet established: with the socket not
                     * owned by the user, report the error and finish the
                     * socket; otherwise only record a soft error.
                     */
507                 if (!sock_owned_by_user(sk)) {
508                         sk->sk_err = err;
509
510                         sk->sk_error_report(sk);
511
512                         tcp_done(sk);
513                 } else {
514                         sk->sk_err_soft = err;
515                 }
516                 goto out;
517         }
518
519         /* If we've already connected we will keep trying
520          * until we time out, or the user gives up.
521          *
522          * rfc1122 4.2.3.9 allows to consider as hard errors
523          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
524          * but it is obsoleted by pmtu discovery).
525          *
526          * Note, that in modern internet, where routing is unreliable
527          * and in each dark corner broken firewalls sit, sending random
528          * errors ordered by their masters even this two messages finally lose
529          * their original sense (even Linux sends invalid PORT_UNREACHs)
530          *
531          * Now we are in compliance with RFCs.
532          *                                                      --ANK (980905)
533          */
534
535         inet = inet_sk(sk);
536         if (!sock_owned_by_user(sk) && inet->recverr) {
537                 sk->sk_err = err;
538                 sk->sk_error_report(sk);
539         } else  { /* Only an error on timeout */
540                 sk->sk_err_soft = err;
541         }
542
543 out:
544         bh_unlock_sock(sk);
545         sock_put(sk);
546 }
547
548 static void __tcp_v4_send_check(struct sk_buff *skb,
549                                 __be32 saddr, __be32 daddr)
550 {
551         struct tcphdr *th = tcp_hdr(skb);
552
553         if (skb->ip_summed == CHECKSUM_PARTIAL) {
554                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
555                 skb->csum_start = skb_transport_header(skb) - skb->head;
556                 skb->csum_offset = offsetof(struct tcphdr, check);
557         } else {
558                 th->check = tcp_v4_check(skb->len, saddr, daddr,
559                                          csum_partial(th,
560                                                       th->doff << 2,
561                                                       skb->csum));
562         }
563 }
564
565 /* This routine computes an IPv4 TCP checksum. */
566 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
567 {
568         const struct inet_sock *inet = inet_sk(sk);
569
570         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
571 }
572 EXPORT_SYMBOL(tcp_v4_send_check);
573
574 int tcp_v4_gso_send_check(struct sk_buff *skb)
575 {
576         const struct iphdr *iph;
577         struct tcphdr *th;
578
579         if (!pskb_may_pull(skb, sizeof(*th)))
580                 return -EINVAL;
581
582         iph = ip_hdr(skb);
583         th = tcp_hdr(skb);
584
585         th->check = 0;
586         skb->ip_summed = CHECKSUM_PARTIAL;
587         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
588         return 0;
589 }
590
591 /*
592  *      This routine will send an RST to the other tcp.
593  *
594  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
595  *                    for reset.
596  *      Answer: if a packet caused RST, it is not for a socket
597  *              existing in our system, if it is matched to a socket,
598  *              it is just duplicate segment or bug in other side's TCP.
599  *              So that we build reply only basing on parameters
600  *              arrived with segment.
601  *      Exception: precedence violation. We do not implement it in any case.
602  */
603
/*
 * tcp_v4_send_reset - answer the segment in @skb with an RST.
 * @sk:  socket the segment matched, or NULL; only used for the MD5 key
 *       lookup, the transparent flag and the bound device
 * @skb: the offending segment; ports, sequence numbers, addresses and TOS
 *       of the reply are all derived from it
 *
 * Nothing is sent when @skb is itself an RST or was not routed as
 * RTN_LOCAL.  With CONFIG_TCP_MD5SIG and no socket, nothing is sent either
 * unless the segment's MD5 option verifies against a listener's key.
 */
604 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
605 {
606         const struct tcphdr *th = tcp_hdr(skb);
            /* Reply buffer: TCP header plus room for one aligned MD5 option. */
607         struct {
608                 struct tcphdr th;
609 #ifdef CONFIG_TCP_MD5SIG
610                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
611 #endif
612         } rep;
613         struct ip_reply_arg arg;
614 #ifdef CONFIG_TCP_MD5SIG
615         struct tcp_md5sig_key *key;
616         const __u8 *hash_location = NULL;
617         unsigned char newhash[16];
618         int genhash;
619         struct sock *sk1 = NULL;
620 #endif
621         struct net *net;
622
623         /* Never send a reset in response to a reset. */
624         if (th->rst)
625                 return;
626
627         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
628                 return;
629
630         /* Swap the send and the receive. */
631         memset(&rep, 0, sizeof(rep));
632         rep.th.dest   = th->source;
633         rep.th.source = th->dest;
634         rep.th.doff   = sizeof(struct tcphdr) / 4;
635         rep.th.rst    = 1;
636
            /* If the segment carried an ACK, the RST takes its sequence
             * number from that ACK.  Otherwise the RST acknowledges
             * everything the segment occupied: its seq plus SYN, FIN and
             * payload length.
             */
637         if (th->ack) {
638                 rep.th.seq = th->ack_seq;
639         } else {
640                 rep.th.ack = 1;
641                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
642                                        skb->len - (th->doff << 2));
643         }
644
645         memset(&arg, 0, sizeof(arg));
646         arg.iov[0].iov_base = (unsigned char *)&rep;
647         arg.iov[0].iov_len  = sizeof(rep.th);
648
649 #ifdef CONFIG_TCP_MD5SIG
650         hash_location = tcp_parse_md5sig_option(th);
651         if (!sk && hash_location) {
652                 /*
653                  * active side is lost. Try to find listening socket through
654                  * source port, and then find md5 key through listening socket.
655                  * we are not loose security here:
656                  * Incoming packet is checked with md5 hash with finding key,
657                  * no RST generated if md5 hash doesn't match.
658                  */
659                 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
660                                              &tcp_hashinfo, ip_hdr(skb)->saddr,
661                                              th->source, ip_hdr(skb)->daddr,
662                                              ntohs(th->source), inet_iif(skb));
663                 /* don't send rst if it can't find key */
664                 if (!sk1)
665                         return;
                    /* Taken only on this path; release_sk1 drops it again
                     * when sk1 is set.
                     */
666                 rcu_read_lock();
                    /* Keys are looked up by the peer's address, i.e. the
                     * source of the incoming segment.
                     */
667                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
668                                         &ip_hdr(skb)->saddr, AF_INET);
669                 if (!key)
670                         goto release_sk1;
671
672                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
673                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
674                         goto release_sk1;
675         } else {
676                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
677                                              &ip_hdr(skb)->saddr,
678                                              AF_INET) : NULL;
679         }
680
681         if (key) {
682                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
683                                    (TCPOPT_NOP << 16) |
684                                    (TCPOPT_MD5SIG << 8) |
685                                    TCPOLEN_MD5SIG);
686                 /* Update length and the length the header thinks exists */
687                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
688                 rep.th.doff = arg.iov[0].iov_len / 4;
689
                    /* Signed with the incoming source as the reply's
                     * destination and vice versa.
                     */
690                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
691                                      key, ip_hdr(skb)->saddr,
692                                      ip_hdr(skb)->daddr, &rep.th);
693         }
694 #endif
695         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
696                                       ip_hdr(skb)->saddr, /* XXX */
697                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
            /* Offset of the check field, expressed in 16-bit units. */
698         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
699         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
700         /* When socket is gone, all binding information is lost.
701          * routing might fail in this case. No choice here, if we choose to force
702          * input interface, we will misroute in case of asymmetric route.
703          */
704         if (sk)
705                 arg.bound_dev_if = sk->sk_bound_dev_if;
706
707         net = dev_net(skb_dst(skb)->dev);
708         arg.tos = ip_hdr(skb)->tos;
709         ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
710                               ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
711
712         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
713         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
714
715 #ifdef CONFIG_TCP_MD5SIG
            /* Also reached by falling through after a successful send; sk1
             * is non-NULL only if the listener lookup above succeeded.
             */
716 release_sk1:
717         if (sk1) {
718                 rcu_read_unlock();
719                 sock_put(sk1);
720         }
721 #endif
722 }
723
724 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
725    outside socket context is ugly, certainly. What can I do?
726  */
727
/*
 * tcp_v4_send_ack - build and send a bare ACK in reply to @skb.
 * @skb:         segment being answered; supplies ports and addresses
 * @seq:         sequence number for the reply (host byte order)
 * @ack:         acknowledgment number for the reply (host byte order)
 * @win:         value for the window field (host byte order)
 * @ts:          timestamp to echo; 0 means no timestamp option is added
 * @oif:         if non-zero, copied to arg.bound_dev_if
 * @key:         MD5 key to sign with, or NULL; only looked at when
 *               CONFIG_TCP_MD5SIG is set
 * @reply_flags: copied to arg.flags
 * @tos:         copied to arg.tos
 */
728 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
729                             u32 win, u32 ts, int oif,
730                             struct tcp_md5sig_key *key,
731                             int reply_flags, u8 tos)
732 {
733         const struct tcphdr *th = tcp_hdr(skb);
            /* Reply buffer: TCP header plus room for an aligned timestamp
             * option and, with CONFIG_TCP_MD5SIG, an aligned MD5 option.
             */
734         struct {
735                 struct tcphdr th;
736                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
737 #ifdef CONFIG_TCP_MD5SIG
738                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
739 #endif
740                         ];
741         } rep;
742         struct ip_reply_arg arg;
743         struct net *net = dev_net(skb_dst(skb)->dev);
744
            /* Only the header is zeroed; option words are written below when
             * they are used, and iov_len is grown to cover just those.
             */
745         memset(&rep.th, 0, sizeof(struct tcphdr));
746         memset(&arg, 0, sizeof(arg));
747
748         arg.iov[0].iov_base = (unsigned char *)&rep;
749         arg.iov[0].iov_len  = sizeof(rep.th);
750         if (ts) {
                    /* NOP, NOP, timestamp option: our clock, then the echoed
                     * value.
                     */
751                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
752                                    (TCPOPT_TIMESTAMP << 8) |
753                                    TCPOLEN_TIMESTAMP);
754                 rep.opt[1] = htonl(tcp_time_stamp);
755                 rep.opt[2] = htonl(ts);
756                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
757         }
758
759         /* Swap the send and the receive. */
760         rep.th.dest    = th->source;
761         rep.th.source  = th->dest;
762         rep.th.doff    = arg.iov[0].iov_len / 4;
763         rep.th.seq     = htonl(seq);
764         rep.th.ack_seq = htonl(ack);
765         rep.th.ack     = 1;
766         rep.th.window  = htons(win);
767
768 #ifdef CONFIG_TCP_MD5SIG
769         if (key) {
                    /* The MD5 option follows the three timestamp words when
                     * those were written above.
                     */
770                 int offset = (ts) ? 3 : 0;
771
772                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
773                                           (TCPOPT_NOP << 16) |
774                                           (TCPOPT_MD5SIG << 8) |
775                                           TCPOLEN_MD5SIG);
776                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
777                 rep.th.doff = arg.iov[0].iov_len/4;
778
                    /* Signed with the incoming source as the reply's
                     * destination and vice versa.
                     */
779                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
780                                     key, ip_hdr(skb)->saddr,
781                                     ip_hdr(skb)->daddr, &rep.th);
782         }
783 #endif
784         arg.flags = reply_flags;
785         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
786                                       ip_hdr(skb)->saddr, /* XXX */
787                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
            /* Offset of the check field, expressed in 16-bit units. */
788         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
789         if (oif)
790                 arg.bound_dev_if = oif;
791         arg.tos = tos;
792         ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
793                               ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
794
795         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
796 }
797
798 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
799 {
800         struct inet_timewait_sock *tw = inet_twsk(sk);
801         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
802
803         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
804                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
805                         tcptw->tw_ts_recent,
806                         tw->tw_bound_dev_if,
807                         tcp_twsk_md5_key(tcptw),
808                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
809                         tw->tw_tos
810                         );
811
812         inet_twsk_put(tw);
813 }
814
815 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
816                                   struct request_sock *req)
817 {
818         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
819          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
820          */
821         tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
822                         tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
823                         tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
824                         req->ts_recent,
825                         0,
826                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
827                                           AF_INET),
828                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
829                         ip_hdr(skb)->tos);
830 }
831
832 /*
833  *      Send a SYN-ACK after having received a SYN.
834  *      This still operates on a request_sock only, not on a big
835  *      socket.
836  */
837 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
838                               struct request_sock *req,
839                               struct request_values *rvp,
840                               u16 queue_mapping,
841                               bool nocache)
842 {
843         const struct inet_request_sock *ireq = inet_rsk(req);
844         struct flowi4 fl4;
845         int err = -1;
846         struct sk_buff * skb;
847
848         /* First, grab a route. */
849         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
850                 return -1;
851
852         skb = tcp_make_synack(sk, dst, req, rvp, NULL);
853
854         if (skb) {
855                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
856
857                 skb_set_queue_mapping(skb, queue_mapping);
858                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
859                                             ireq->rmt_addr,
860                                             ireq->opt);
861                 err = net_xmit_eval(err);
862                 if (!tcp_rsk(req)->snt_synack && !err)
863                         tcp_rsk(req)->snt_synack = tcp_time_stamp;
864         }
865
866         return err;
867 }
868
869 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
870                              struct request_values *rvp)
871 {
872         int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
873
874         if (!res)
875                 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
876         return res;
877 }
878
879 /*
880  *      IPv4 request_sock destructor.
881  */
882 static void tcp_v4_reqsk_destructor(struct request_sock *req)
883 {
884         kfree(inet_rsk(req)->opt);
885 }
886
887 /*
888  * Return true if a syncookie should be sent
889  */
890 bool tcp_syn_flood_action(struct sock *sk,
891                          const struct sk_buff *skb,
892                          const char *proto)
893 {
894         const char *msg = "Dropping request";
895         bool want_cookie = false;
896         struct listen_sock *lopt;
897
898
899
900 #ifdef CONFIG_SYN_COOKIES
901         if (sysctl_tcp_syncookies) {
902                 msg = "Sending cookies";
903                 want_cookie = true;
904                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
905         } else
906 #endif
907                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
908
909         lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
910         if (!lopt->synflood_warned) {
911                 lopt->synflood_warned = 1;
912                 pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
913                         proto, ntohs(tcp_hdr(skb)->dest), msg);
914         }
915         return want_cookie;
916 }
917 EXPORT_SYMBOL(tcp_syn_flood_action);
918
919 /*
920  * Save and compile IPv4 options into the request_sock if needed.
921  */
922 static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
923 {
924         const struct ip_options *opt = &(IPCB(skb)->opt);
925         struct ip_options_rcu *dopt = NULL;
926
927         if (opt && opt->optlen) {
928                 int opt_size = sizeof(*dopt) + opt->optlen;
929
930                 dopt = kmalloc(opt_size, GFP_ATOMIC);
931                 if (dopt) {
932                         if (ip_options_echo(&dopt->opt, skb)) {
933                                 kfree(dopt);
934                                 dopt = NULL;
935                         }
936                 }
937         }
938         return dopt;
939 }
940
941 #ifdef CONFIG_TCP_MD5SIG
942 /*
943  * RFC2385 MD5 checksumming requires a mapping of
944  * IP address->MD5 Key.
945  * We need to maintain these in the sk structure.
946  */
947
948 /* Find the Key structure for an address.  */
949 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
950                                          const union tcp_md5_addr *addr,
951                                          int family)
952 {
953         struct tcp_sock *tp = tcp_sk(sk);
954         struct tcp_md5sig_key *key;
955         struct hlist_node *pos;
956         unsigned int size = sizeof(struct in_addr);
957         struct tcp_md5sig_info *md5sig;
958
959         /* caller either holds rcu_read_lock() or socket lock */
960         md5sig = rcu_dereference_check(tp->md5sig_info,
961                                        sock_owned_by_user(sk) ||
962                                        lockdep_is_held(&sk->sk_lock.slock));
963         if (!md5sig)
964                 return NULL;
965 #if IS_ENABLED(CONFIG_IPV6)
966         if (family == AF_INET6)
967                 size = sizeof(struct in6_addr);
968 #endif
969         hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
970                 if (key->family != family)
971                         continue;
972                 if (!memcmp(&key->addr, addr, size))
973                         return key;
974         }
975         return NULL;
976 }
977 EXPORT_SYMBOL(tcp_md5_do_lookup);
978
979 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
980                                          struct sock *addr_sk)
981 {
982         union tcp_md5_addr *addr;
983
984         addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
985         return tcp_md5_do_lookup(sk, addr, AF_INET);
986 }
987 EXPORT_SYMBOL(tcp_v4_md5_lookup);
988
989 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
990                                                       struct request_sock *req)
991 {
992         union tcp_md5_addr *addr;
993
994         addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
995         return tcp_md5_do_lookup(sk, addr, AF_INET);
996 }
997
998 /* This can be called on a newly created socket, from other files */
999 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1000                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
1001 {
1002         /* Add Key to the list */
1003         struct tcp_md5sig_key *key;
1004         struct tcp_sock *tp = tcp_sk(sk);
1005         struct tcp_md5sig_info *md5sig;
1006
1007         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1008         if (key) {
1009                 /* Pre-existing entry - just update that one. */
1010                 memcpy(key->key, newkey, newkeylen);
1011                 key->keylen = newkeylen;
1012                 return 0;
1013         }
1014
1015         md5sig = rcu_dereference_protected(tp->md5sig_info,
1016                                            sock_owned_by_user(sk));
1017         if (!md5sig) {
1018                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1019                 if (!md5sig)
1020                         return -ENOMEM;
1021
1022                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1023                 INIT_HLIST_HEAD(&md5sig->head);
1024                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1025         }
1026
1027         key = sock_kmalloc(sk, sizeof(*key), gfp);
1028         if (!key)
1029                 return -ENOMEM;
1030         if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1031                 sock_kfree_s(sk, key, sizeof(*key));
1032                 return -ENOMEM;
1033         }
1034
1035         memcpy(key->key, newkey, newkeylen);
1036         key->keylen = newkeylen;
1037         key->family = family;
1038         memcpy(&key->addr, addr,
1039                (family == AF_INET6) ? sizeof(struct in6_addr) :
1040                                       sizeof(struct in_addr));
1041         hlist_add_head_rcu(&key->node, &md5sig->head);
1042         return 0;
1043 }
1044 EXPORT_SYMBOL(tcp_md5_do_add);
1045
1046 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1047 {
1048         struct tcp_sock *tp = tcp_sk(sk);
1049         struct tcp_md5sig_key *key;
1050         struct tcp_md5sig_info *md5sig;
1051
1052         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1053         if (!key)
1054                 return -ENOENT;
1055         hlist_del_rcu(&key->node);
1056         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1057         kfree_rcu(key, rcu);
1058         md5sig = rcu_dereference_protected(tp->md5sig_info,
1059                                            sock_owned_by_user(sk));
1060         if (hlist_empty(&md5sig->head))
1061                 tcp_free_md5sig_pool();
1062         return 0;
1063 }
1064 EXPORT_SYMBOL(tcp_md5_do_del);
1065
1066 static void tcp_clear_md5_list(struct sock *sk)
1067 {
1068         struct tcp_sock *tp = tcp_sk(sk);
1069         struct tcp_md5sig_key *key;
1070         struct hlist_node *pos, *n;
1071         struct tcp_md5sig_info *md5sig;
1072
1073         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1074
1075         if (!hlist_empty(&md5sig->head))
1076                 tcp_free_md5sig_pool();
1077         hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1078                 hlist_del_rcu(&key->node);
1079                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1080                 kfree_rcu(key, rcu);
1081         }
1082 }
1083
1084 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1085                                  int optlen)
1086 {
1087         struct tcp_md5sig cmd;
1088         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1089
1090         if (optlen < sizeof(cmd))
1091                 return -EINVAL;
1092
1093         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1094                 return -EFAULT;
1095
1096         if (sin->sin_family != AF_INET)
1097                 return -EINVAL;
1098
1099         if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1100                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1101                                       AF_INET);
1102
1103         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1104                 return -EINVAL;
1105
1106         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1107                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1108                               GFP_KERNEL);
1109 }
1110
1111 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1112                                         __be32 daddr, __be32 saddr, int nbytes)
1113 {
1114         struct tcp4_pseudohdr *bp;
1115         struct scatterlist sg;
1116
1117         bp = &hp->md5_blk.ip4;
1118
1119         /*
1120          * 1. the TCP pseudo-header (in the order: source IP address,
1121          * destination IP address, zero-padded protocol number, and
1122          * segment length)
1123          */
1124         bp->saddr = saddr;
1125         bp->daddr = daddr;
1126         bp->pad = 0;
1127         bp->protocol = IPPROTO_TCP;
1128         bp->len = cpu_to_be16(nbytes);
1129
1130         sg_init_one(&sg, bp, sizeof(*bp));
1131         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1132 }
1133
1134 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1135                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1136 {
1137         struct tcp_md5sig_pool *hp;
1138         struct hash_desc *desc;
1139
1140         hp = tcp_get_md5sig_pool();
1141         if (!hp)
1142                 goto clear_hash_noput;
1143         desc = &hp->md5_desc;
1144
1145         if (crypto_hash_init(desc))
1146                 goto clear_hash;
1147         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1148                 goto clear_hash;
1149         if (tcp_md5_hash_header(hp, th))
1150                 goto clear_hash;
1151         if (tcp_md5_hash_key(hp, key))
1152                 goto clear_hash;
1153         if (crypto_hash_final(desc, md5_hash))
1154                 goto clear_hash;
1155
1156         tcp_put_md5sig_pool();
1157         return 0;
1158
1159 clear_hash:
1160         tcp_put_md5sig_pool();
1161 clear_hash_noput:
1162         memset(md5_hash, 0, 16);
1163         return 1;
1164 }
1165
1166 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1167                         const struct sock *sk, const struct request_sock *req,
1168                         const struct sk_buff *skb)
1169 {
1170         struct tcp_md5sig_pool *hp;
1171         struct hash_desc *desc;
1172         const struct tcphdr *th = tcp_hdr(skb);
1173         __be32 saddr, daddr;
1174
1175         if (sk) {
1176                 saddr = inet_sk(sk)->inet_saddr;
1177                 daddr = inet_sk(sk)->inet_daddr;
1178         } else if (req) {
1179                 saddr = inet_rsk(req)->loc_addr;
1180                 daddr = inet_rsk(req)->rmt_addr;
1181         } else {
1182                 const struct iphdr *iph = ip_hdr(skb);
1183                 saddr = iph->saddr;
1184                 daddr = iph->daddr;
1185         }
1186
1187         hp = tcp_get_md5sig_pool();
1188         if (!hp)
1189                 goto clear_hash_noput;
1190         desc = &hp->md5_desc;
1191
1192         if (crypto_hash_init(desc))
1193                 goto clear_hash;
1194
1195         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1196                 goto clear_hash;
1197         if (tcp_md5_hash_header(hp, th))
1198                 goto clear_hash;
1199         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1200                 goto clear_hash;
1201         if (tcp_md5_hash_key(hp, key))
1202                 goto clear_hash;
1203         if (crypto_hash_final(desc, md5_hash))
1204                 goto clear_hash;
1205
1206         tcp_put_md5sig_pool();
1207         return 0;
1208
1209 clear_hash:
1210         tcp_put_md5sig_pool();
1211 clear_hash_noput:
1212         memset(md5_hash, 0, 16);
1213         return 1;
1214 }
1215 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1216
1217 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1218 {
1219         /*
1220          * This gets called for each TCP segment that arrives
1221          * so we want to be efficient.
1222          * We have 3 drop cases:
1223          * o No MD5 hash and one expected.
1224          * o MD5 hash and we're not expecting one.
1225          * o MD5 hash and its wrong.
1226          */
1227         const __u8 *hash_location = NULL;
1228         struct tcp_md5sig_key *hash_expected;
1229         const struct iphdr *iph = ip_hdr(skb);
1230         const struct tcphdr *th = tcp_hdr(skb);
1231         int genhash;
1232         unsigned char newhash[16];
1233
1234         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1235                                           AF_INET);
1236         hash_location = tcp_parse_md5sig_option(th);
1237
1238         /* We've parsed the options - do we have a hash? */
1239         if (!hash_expected && !hash_location)
1240                 return false;
1241
1242         if (hash_expected && !hash_location) {
1243                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1244                 return true;
1245         }
1246
1247         if (!hash_expected && hash_location) {
1248                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1249                 return true;
1250         }
1251
1252         /* Okay, so this is hash_expected and hash_location -
1253          * so we need to calculate the checksum.
1254          */
1255         genhash = tcp_v4_md5_hash_skb(newhash,
1256                                       hash_expected,
1257                                       NULL, NULL, skb);
1258
1259         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1260                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1261                                      &iph->saddr, ntohs(th->source),
1262                                      &iph->daddr, ntohs(th->dest),
1263                                      genhash ? " tcp_v4_calc_md5_hash failed"
1264                                      : "");
1265                 return true;
1266         }
1267         return false;
1268 }
1269
1270 #endif
1271
1272 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1273         .family         =       PF_INET,
1274         .obj_size       =       sizeof(struct tcp_request_sock),
1275         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1276         .send_ack       =       tcp_v4_reqsk_send_ack,
1277         .destructor     =       tcp_v4_reqsk_destructor,
1278         .send_reset     =       tcp_v4_send_reset,
1279         .syn_ack_timeout =      tcp_syn_ack_timeout,
1280 };
1281
1282 #ifdef CONFIG_TCP_MD5SIG
1283 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1284         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1285         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1286 };
1287 #endif
1288
/*
 * Decide whether an incoming SYN is to be handled as a TCP Fast Open
 * request.
 *
 * @sk:        listening socket
 * @skb:       the received SYN
 * @req:       request socket; its rcv_nxt is set to the SYN's end_seq
 *             whenever true is returned
 * @foc:       cookie parsed from the SYN; its len is set to -1 when the
 *             request is refused because the Fast Open queue is full
 * @valid_foc: filled by tcp_fastopen_cookie_gen() from the SYN's source
 *             address when a cookie has to be checked or handed out; its
 *             len is reset to -1 after a successful comparison
 *
 * Returns true when the cookie check is skipped (sysctl knobs), when the
 * presented cookie matches the generated one, or when cookie checking is
 * disabled via TFO_SERVER_COOKIE_NOT_CHKED.  Returns false in every other
 * case: no cookie and no knob allowing that, Fast Open not enabled for
 * the listener, pending-request queue full, cookie mismatch, cookie
 * request (len == 0) or wrong cookie size.
 *
 * NOTE(review): LINUX_MIB_TCPFASTOPENPASSIVE is also bumped on the
 * skip_cookie path, where no cookie option was present in the SYN.
 */
1289 static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1290                                struct request_sock *req,
1291                                struct tcp_fastopen_cookie *foc,
1292                                struct tcp_fastopen_cookie *valid_foc)
1293 {
1294         bool skip_cookie = false;
1295         struct fastopen_queue *fastopenq;
1296
1297         if (likely(!fastopen_cookie_present(foc))) {
1298                 /* See include/net/tcp.h for the meaning of these knobs */
1299                 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1300                     ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1301                     (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1302                         skip_cookie = true; /* no cookie to validate */
1303                 else
1304                         return false;
1305         }
1306         fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1307         /* A FO option is present; bump the counter. */
1308         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1309
1310         /* Make sure the listener has enabled fastopen, and we don't
1311          * exceed the max # of pending TFO requests allowed before trying
1312          * to validate the cookie in order to avoid burning CPU cycles
1313          * unnecessarily.
1314          *
1315          * XXX (TFO) - The implication of checking the max_qlen before
1316          * processing a cookie request is that clients can't differentiate
1317          * between qlen overflow causing Fast Open to be disabled
1318          * temporarily vs a server not supporting Fast Open at all.
1319          */
1320         if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1321             fastopenq == NULL || fastopenq->max_qlen == 0)
1322                 return false;
1323
        /* Queue full: try to make room by dropping the entry at the head of
         * the rskq_rst_head list, but only if its expiry time has passed.
         */
1324         if (fastopenq->qlen >= fastopenq->max_qlen) {
1325                 struct request_sock *req1;
1326                 spin_lock(&fastopenq->lock);
1327                 req1 = fastopenq->rskq_rst_head;
1328                 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1329                         spin_unlock(&fastopenq->lock);
1330                         NET_INC_STATS_BH(sock_net(sk),
1331                             LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1332                         /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1333                         foc->len = -1;
1334                         return false;
1335                 }
1336                 fastopenq->rskq_rst_head = req1->dl_next;
1337                 fastopenq->qlen--;
1338                 spin_unlock(&fastopenq->lock);
1339                 reqsk_free(req1);
1340         }
1341         if (skip_cookie) {
1342                 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1343                 return true;
1344         }
1345         if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1346                 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1347                         tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1348                         if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1349                             memcmp(&foc->val[0], &valid_foc->val[0],
1350                             TCP_FASTOPEN_COOKIE_SIZE) != 0)
1351                                 return false;
1352                         valid_foc->len = -1;
1353                 }
1354                 /* Acknowledge the data received from the peer. */
1355                 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1356                 return true;
1357         } else if (foc->len == 0) { /* Client requesting a cookie */
1358                 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1359                 NET_INC_STATS_BH(sock_net(sk),
1360                     LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1361         } else {
1362                 /* Client sent a cookie with wrong size. Treat it
1363                  * the same as invalid and return a valid one.
1364                  */
1365                 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1366         }
1367         return false;
1368 }
1369
/*
 * Complete a Fast Open connection request: create the child socket from
 * the SYN, transmit the already built SYN-ACK, and put the child straight
 * onto the listener's accept queue.
 *
 * @sk:         listening socket
 * @skb:        the received SYN; when it carries payload an extra
 *              reference is taken and it is queued on the child's
 *              receive queue
 * @skb_synack: SYN-ACK to transmit; freed here if no child can be created,
 *              otherwise handed to ip_build_and_send_pkt()
 * @req:        request socket for this connection
 * @rvp:        not referenced in this function
 *
 * Returns 0 on success, or -1 if syn_recv_sock() fails to create the
 * child.  A failure to transmit the SYN-ACK is not reported to the caller.
 *
 * NOTE(review): the child is unlocked with bh_unlock_sock() and released
 * with sock_put() at the end, so syn_recv_sock() presumably returns it
 * locked with a reference held - confirm against syn_recv_sock().
 */
1370 static int tcp_v4_conn_req_fastopen(struct sock *sk,
1371                                     struct sk_buff *skb,
1372                                     struct sk_buff *skb_synack,
1373                                     struct request_sock *req,
1374                                     struct request_values *rvp)
1375 {
        /* tp initially refers to the listener; it is re-pointed at the
         * child below before it is first used.
         */
1376         struct tcp_sock *tp = tcp_sk(sk);
1377         struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1378         const struct inet_request_sock *ireq = inet_rsk(req);
1379         struct sock *child;
1380         int err;
1381
1382         req->num_retrans = 0;
1383         req->num_timeout = 0;
1384         req->sk = NULL;
1385
1386         child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1387         if (child == NULL) {
1388                 NET_INC_STATS_BH(sock_net(sk),
1389                                  LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1390                 kfree_skb(skb_synack);
1391                 return -1;
1392         }
1393         err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1394                                     ireq->rmt_addr, ireq->opt);
1395         err = net_xmit_eval(err);
1396         if (!err)
1397                 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1398         /* XXX (TFO) - is it ok to ignore error and continue? */
1399
1400         spin_lock(&queue->fastopenq->lock);
1401         queue->fastopenq->qlen++;
1402         spin_unlock(&queue->fastopenq->lock);
1403
1404         /* Initialize the child socket. Have to fix some values to take
1405          * into account the child is a Fast Open socket and is created
1406          * only out of the bits carried in the SYN packet.
1407          */
1408         tp = tcp_sk(child);
1409
1410         tp->fastopen_rsk = req;
1411         /* Do a hold on the listener sk so that if the listener is being
1412          * closed, the child that has been accepted can live on and still
1413          * access listen_lock.
1414          */
1415         sock_hold(sk);
1416         tcp_rsk(req)->listener = sk;
1417
1418         /* RFC1323: The window in SYN & SYN/ACK segments is never
1419          * scaled. So correct it appropriately.
1420          */
1421         tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1422
1423         /* Activate the retrans timer so that SYNACK can be retransmitted.
1424          * The request socket is not added to the SYN table of the parent
1425          * because it's been added to the accept queue directly.
1426          */
1427         inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1428             TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1429
1430         /* Add the child socket directly into the accept queue */
1431         inet_csk_reqsk_queue_add(sk, req, child);
1432
1433         /* Now finish processing the fastopen child socket. */
1434         inet_csk(child)->icsk_af_ops->rebuild_header(child);
1435         tcp_init_congestion_control(child);
1436         tcp_mtup_init(child);
1437         tcp_init_buffer_space(child);
1438         tcp_init_metrics(child);
1439
1440         /* Queue the data carried in the SYN packet. We need to first
1441          * bump skb's refcnt because the caller will attempt to free it.
1442          *
1443          * XXX (TFO) - we honor a zero-payload TFO request for now.
1444          * (Any reason not to?)
1445          */
1446         if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1447                 /* Don't queue the skb if there is no payload in SYN.
1448                  * XXX (TFO) - How about SYN+FIN?
1449                  */
1450                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1451         } else {
1452                 skb = skb_get(skb);
1453                 skb_dst_drop(skb);
1454                 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1455                 skb_set_owner_r(skb, child);
1456                 __skb_queue_tail(&child->sk_receive_queue, skb);
1457                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1458                 tp->syn_data_acked = 1;
1459         }
        /* The data-ready callback is invoked on the listener (sk), not on
         * the child.
         */
1460         sk->sk_data_ready(sk, 0);
1461         bh_unlock_sock(child);
1462         sock_put(child);
1463         WARN_ON(req->sk == NULL);
1464         return 0;
1465 }
1466
1467 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1468 {
1469         struct tcp_extend_values tmp_ext;
1470         struct tcp_options_received tmp_opt;
1471         const u8 *hash_location;
1472         struct request_sock *req;
1473         struct inet_request_sock *ireq;
1474         struct tcp_sock *tp = tcp_sk(sk);
1475         struct dst_entry *dst = NULL;
1476         __be32 saddr = ip_hdr(skb)->saddr;
1477         __be32 daddr = ip_hdr(skb)->daddr;
1478         __u32 isn = TCP_SKB_CB(skb)->when;
1479         bool want_cookie = false;
1480         struct flowi4 fl4;
1481         struct tcp_fastopen_cookie foc = { .len = -1 };
1482         struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1483         struct sk_buff *skb_synack;
1484         int do_fastopen;
1485
1486         /* Never answer to SYNs send to broadcast or multicast */
1487         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1488                 goto drop;
1489
1490         /* TW buckets are converted to open requests without
1491          * limitations, they conserve resources and peer is
1492          * evidently real one.
1493          */
1494         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1495                 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1496                 if (!want_cookie)
1497                         goto drop;
1498         }
1499
1500         /* Accept backlog is full. If we have already queued enough
1501          * of warm entries in syn queue, drop request. It is better than
1502          * clogging syn queue with openreqs with exponentially increasing
1503          * timeout.
1504          */
1505         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1506                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1507                 goto drop;
1508         }
1509
1510         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1511         if (!req)
1512                 goto drop;
1513
1514 #ifdef CONFIG_TCP_MD5SIG
1515         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1516 #endif
1517
1518         tcp_clear_options(&tmp_opt);
1519         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1520         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1521         tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1522             want_cookie ? NULL : &foc);
1523
1524         if (tmp_opt.cookie_plus > 0 &&
1525             tmp_opt.saw_tstamp &&
1526             !tp->rx_opt.cookie_out_never &&
1527             (sysctl_tcp_cookie_size > 0 ||
1528              (tp->cookie_values != NULL &&
1529               tp->cookie_values->cookie_desired > 0))) {
1530                 u8 *c;
1531                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1532                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1533
1534                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1535                         goto drop_and_release;
1536
1537                 /* Secret recipe starts with IP addresses */
1538                 *mess++ ^= (__force u32)daddr;
1539                 *mess++ ^= (__force u32)saddr;
1540
1541                 /* plus variable length Initiator Cookie */
1542                 c = (u8 *)mess;
1543                 while (l-- > 0)
1544                         *c++ ^= *hash_location++;
1545
1546                 want_cookie = false;    /* not our kind of cookie */
1547                 tmp_ext.cookie_out_never = 0; /* false */
1548                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1549         } else if (!tp->rx_opt.cookie_in_always) {
1550                 /* redundant indications, but ensure initialization. */
1551                 tmp_ext.cookie_out_never = 1; /* true */
1552                 tmp_ext.cookie_plus = 0;
1553         } else {
1554                 goto drop_and_release;
1555         }
1556         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1557
1558         if (want_cookie && !tmp_opt.saw_tstamp)
1559                 tcp_clear_options(&tmp_opt);
1560
1561         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1562         tcp_openreq_init(req, &tmp_opt, skb);
1563
1564         ireq = inet_rsk(req);
1565         ireq->loc_addr = daddr;
1566         ireq->rmt_addr = saddr;
1567         ireq->no_srccheck = inet_sk(sk)->transparent;
1568         ireq->opt = tcp_v4_save_options(skb);
1569
1570         if (security_inet_conn_request(sk, skb, req))
1571                 goto drop_and_free;
1572
1573         if (!want_cookie || tmp_opt.tstamp_ok)
1574                 TCP_ECN_create_request(req, skb, sock_net(sk));
1575
1576         if (want_cookie) {
1577                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1578                 req->cookie_ts = tmp_opt.tstamp_ok;
1579         } else if (!isn) {
1580                 /* VJ's idea. We save last timestamp seen
1581                  * from the destination in peer table, when entering
1582                  * state TIME-WAIT, and check against it before
1583                  * accepting new connection request.
1584                  *
1585                  * If "isn" is not zero, this request hit alive
1586                  * timewait bucket, so that all the necessary checks
1587                  * are made in the function processing timewait state.
1588                  */
1589                 if (tmp_opt.saw_tstamp &&
1590                     tcp_death_row.sysctl_tw_recycle &&
1591                     (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1592                     fl4.daddr == saddr) {
1593                         if (!tcp_peer_is_proven(req, dst, true)) {
1594                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1595                                 goto drop_and_release;
1596                         }
1597                 }
1598                 /* Kill the following clause, if you dislike this way. */
1599                 else if (!sysctl_tcp_syncookies &&
1600                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1601                           (sysctl_max_syn_backlog >> 2)) &&
1602                          !tcp_peer_is_proven(req, dst, false)) {
1603                         /* Without syncookies last quarter of
1604                          * backlog is filled with destinations,
1605                          * proven to be alive.
1606                          * It means that we continue to communicate
1607                          * to destinations, already remembered
1608                          * to the moment of synflood.
1609                          */
1610                         LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1611                                        &saddr, ntohs(tcp_hdr(skb)->source));
1612                         goto drop_and_release;
1613                 }
1614
1615                 isn = tcp_v4_init_sequence(skb);
1616         }
1617         tcp_rsk(req)->snt_isn = isn;
1618
1619         if (dst == NULL) {
1620                 dst = inet_csk_route_req(sk, &fl4, req);
1621                 if (dst == NULL)
1622                         goto drop_and_free;
1623         }
1624         do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1625
1626         /* We don't call tcp_v4_send_synack() directly because we need
1627          * to make sure a child socket can be created successfully before
1628          * sending back synack!
1629          *
1630          * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1631          * (or better yet, call tcp_send_synack() in the child context
1632          * directly, but will have to fix bunch of other code first)
1633          * after syn_recv_sock() except one will need to first fix the
1634          * latter to remove its dependency on the current implementation
1635          * of tcp_v4_send_synack()->tcp_select_initial_window().
1636          */
1637         skb_synack = tcp_make_synack(sk, dst, req,
1638             (struct request_values *)&tmp_ext,
1639             fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1640
1641         if (skb_synack) {
1642                 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1643                 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1644         } else
1645                 goto drop_and_free;
1646
1647         if (likely(!do_fastopen)) {
1648                 int err;
1649                 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1650                      ireq->rmt_addr, ireq->opt);
1651                 err = net_xmit_eval(err);
1652                 if (err || want_cookie)
1653                         goto drop_and_free;
1654
1655                 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1656                 tcp_rsk(req)->listener = NULL;
1657                 /* Add the request_sock to the SYN table */
1658                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1659                 if (fastopen_cookie_present(&foc) && foc.len != 0)
1660                         NET_INC_STATS_BH(sock_net(sk),
1661                             LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1662         } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1663             (struct request_values *)&tmp_ext))
1664                 goto drop_and_free;
1665
1666         return 0;
1667
1668 drop_and_release:
1669         dst_release(dst);
1670 drop_and_free:
1671         reqsk_free(req);
1672 drop:
1673         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1674         return 0;
1675 }
1676 EXPORT_SYMBOL(tcp_v4_conn_request);
1677
1678
1679 /*
1680  * The three way handshake has completed - we got a valid synack -
1681  * now create the new socket.
1682  */
1683 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1684                                   struct request_sock *req,
1685                                   struct dst_entry *dst)
1686 {
1687         struct inet_request_sock *ireq;
1688         struct inet_sock *newinet;
1689         struct tcp_sock *newtp;
1690         struct sock *newsk;
1691 #ifdef CONFIG_TCP_MD5SIG
1692         struct tcp_md5sig_key *key;
1693 #endif
1694         struct ip_options_rcu *inet_opt;
1695
1696         if (sk_acceptq_is_full(sk))
1697                 goto exit_overflow;
1698
1699         newsk = tcp_create_openreq_child(sk, req, skb);
1700         if (!newsk)
1701                 goto exit_nonewsk;
1702
1703         newsk->sk_gso_type = SKB_GSO_TCPV4;
1704         inet_sk_rx_dst_set(newsk, skb);
1705
1706         newtp                 = tcp_sk(newsk);
1707         newinet               = inet_sk(newsk);
1708         ireq                  = inet_rsk(req);
1709         newinet->inet_daddr   = ireq->rmt_addr;
1710         newinet->inet_rcv_saddr = ireq->loc_addr;
1711         newinet->inet_saddr           = ireq->loc_addr;
1712         inet_opt              = ireq->opt;
1713         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1714         ireq->opt             = NULL;
1715         newinet->mc_index     = inet_iif(skb);
1716         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1717         newinet->rcv_tos      = ip_hdr(skb)->tos;
1718         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1719         if (inet_opt)
1720                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1721         newinet->inet_id = newtp->write_seq ^ jiffies;
1722
1723         if (!dst) {
1724                 dst = inet_csk_route_child_sock(sk, newsk, req);
1725                 if (!dst)
1726                         goto put_and_exit;
1727         } else {
1728                 /* syncookie case : see end of cookie_v4_check() */
1729         }
1730         sk_setup_caps(newsk, dst);
1731
1732         tcp_mtup_init(newsk);
1733         tcp_sync_mss(newsk, dst_mtu(dst));
1734         newtp->advmss = dst_metric_advmss(dst);
1735         if (tcp_sk(sk)->rx_opt.user_mss &&
1736             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1737                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1738
1739         tcp_initialize_rcv_mss(newsk);
1740         tcp_synack_rtt_meas(newsk, req);
1741         newtp->total_retrans = req->num_retrans;
1742
1743 #ifdef CONFIG_TCP_MD5SIG
1744         /* Copy over the MD5 key from the original socket */
1745         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1746                                 AF_INET);
1747         if (key != NULL) {
1748                 /*
1749                  * We're using one, so create a matching key
1750                  * on the newsk structure. If we fail to get
1751                  * memory, then we end up not copying the key
1752                  * across. Shucks.
1753                  */
1754                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1755                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1756                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1757         }
1758 #endif
1759
1760         if (__inet_inherit_port(sk, newsk) < 0)
1761                 goto put_and_exit;
1762         __inet_hash_nolisten(newsk, NULL);
1763
1764         return newsk;
1765
1766 exit_overflow:
1767         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1768 exit_nonewsk:
1769         dst_release(dst);
1770 exit:
1771         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1772         return NULL;
1773 put_and_exit:
1774         inet_csk_prepare_forced_close(newsk);
1775         tcp_done(newsk);
1776         goto exit;
1777 }
1778 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1779
1780 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1781 {
1782         struct tcphdr *th = tcp_hdr(skb);
1783         const struct iphdr *iph = ip_hdr(skb);
1784         struct sock *nsk;
1785         struct request_sock **prev;
1786         /* Find possible connection requests. */
1787         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1788                                                        iph->saddr, iph->daddr);
1789         if (req)
1790                 return tcp_check_req(sk, skb, req, prev, false);
1791
1792         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1793                         th->source, iph->daddr, th->dest, inet_iif(skb));
1794
1795         if (nsk) {
1796                 if (nsk->sk_state != TCP_TIME_WAIT) {
1797                         bh_lock_sock(nsk);
1798                         return nsk;
1799                 }
1800                 inet_twsk_put(inet_twsk(nsk));
1801                 return NULL;
1802         }
1803
1804 #ifdef CONFIG_SYN_COOKIES
1805         if (!th->syn)
1806                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1807 #endif
1808         return sk;
1809 }
1810
1811 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1812 {
1813         const struct iphdr *iph = ip_hdr(skb);
1814
1815         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1816                 if (!tcp_v4_check(skb->len, iph->saddr,
1817                                   iph->daddr, skb->csum)) {
1818                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1819                         return 0;
1820                 }
1821         }
1822
1823         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1824                                        skb->len, IPPROTO_TCP, 0);
1825
1826         if (skb->len <= 76) {
1827                 return __skb_checksum_complete(skb);
1828         }
1829         return 0;
1830 }
1831
1832
1833 /* The socket must have it's spinlock held when we get
1834  * here.
1835  *
1836  * We have a potential double-lock case here, so even when
1837  * doing backlog processing we use the BH locking scheme.
1838  * This is because we cannot sleep with the original spinlock
1839  * held.
1840  */
1841 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1842 {
1843         struct sock *rsk;
1844 #ifdef CONFIG_TCP_MD5SIG
1845         /*
1846          * We really want to reject the packet as early as possible
1847          * if:
1848          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1849          *  o There is an MD5 option and we're not expecting one
1850          */
1851         if (tcp_v4_inbound_md5_hash(sk, skb))
1852                 goto discard;
1853 #endif
1854
1855         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1856                 struct dst_entry *dst = sk->sk_rx_dst;
1857
1858                 sock_rps_save_rxhash(sk, skb);
1859                 if (dst) {
1860                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1861                             dst->ops->check(dst, 0) == NULL) {
1862                                 dst_release(dst);
1863                                 sk->sk_rx_dst = NULL;
1864                         }
1865                 }
1866                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1867                         rsk = sk;
1868                         goto reset;
1869                 }
1870                 return 0;
1871         }
1872
1873         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1874                 goto csum_err;
1875
1876         if (sk->sk_state == TCP_LISTEN) {
1877                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1878                 if (!nsk)
1879                         goto discard;
1880
1881                 if (nsk != sk) {
1882                         sock_rps_save_rxhash(nsk, skb);
1883                         if (tcp_child_process(sk, nsk, skb)) {
1884                                 rsk = nsk;
1885                                 goto reset;
1886                         }
1887                         return 0;
1888                 }
1889         } else
1890                 sock_rps_save_rxhash(sk, skb);
1891
1892         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1893                 rsk = sk;
1894                 goto reset;
1895         }
1896         return 0;
1897
1898 reset:
1899         tcp_v4_send_reset(rsk, skb);
1900 discard:
1901         kfree_skb(skb);
1902         /* Be careful here. If this function gets more complicated and
1903          * gcc suffers from register pressure on the x86, sk (in %ebx)
1904          * might be destroyed here. This current version compiles correctly,
1905          * but you have been warned.
1906          */
1907         return 0;
1908
1909 csum_err:
1910         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1911         goto discard;
1912 }
1913 EXPORT_SYMBOL(tcp_v4_do_rcv);
1914
1915 void tcp_v4_early_demux(struct sk_buff *skb)
1916 {
1917         const struct iphdr *iph;
1918         const struct tcphdr *th;
1919         struct sock *sk;
1920
1921         if (skb->pkt_type != PACKET_HOST)
1922                 return;
1923
1924         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1925                 return;
1926
1927         iph = ip_hdr(skb);
1928         th = tcp_hdr(skb);
1929
1930         if (th->doff < sizeof(struct tcphdr) / 4)
1931                 return;
1932
1933         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1934                                        iph->saddr, th->source,
1935                                        iph->daddr, ntohs(th->dest),
1936                                        skb->skb_iif);
1937         if (sk) {
1938                 skb->sk = sk;
1939                 skb->destructor = sock_edemux;
1940                 if (sk->sk_state != TCP_TIME_WAIT) {
1941                         struct dst_entry *dst = sk->sk_rx_dst;
1942
1943                         if (dst)
1944                                 dst = dst_check(dst, 0);
1945                         if (dst &&
1946                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1947                                 skb_dst_set_noref(skb, dst);
1948                 }
1949         }
1950 }
1951
1952 /*
1953  *      From tcp_input.c
1954  */
1955
/*
 * Receive one IPv4 TCP segment.
 *
 * Checks the TCP header length and (unless already known good) the
 * checksum, fills in TCP_SKB_CB(skb), and looks up the owning socket with
 * __inet_lookup_skb().  With the socket BH-locked the segment is then
 * either handed to tcp_v4_do_rcv() (unless tcp_prequeue() took it) or,
 * when a user context owns the socket, added to the socket backlog.
 * Sockets in TIME_WAIT are handled through tcp_timewait_state_process().
 * When no socket matches, a well-formed segment is answered with
 * tcp_v4_send_reset().
 *
 * Returns the value of tcp_v4_do_rcv() when it ran, otherwise 0.  The skb
 * is freed here on every discard path.
 */
1956 int tcp_v4_rcv(struct sk_buff *skb)
1957 {
1958         const struct iphdr *iph;
1959         const struct tcphdr *th;
1960         struct sock *sk;
1961         int ret;
1962         struct net *net = dev_net(skb->dev);
1963
1964         if (skb->pkt_type != PACKET_HOST)
1965                 goto discard_it;
1966
1967         /* Count it even if it's bad */
1968         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1969
1970         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1971                 goto discard_it;
1972
1973         th = tcp_hdr(skb);
1974
             /* doff is in 32-bit words: reject headers shorter than the fixed part. */
1975         if (th->doff < sizeof(struct tcphdr) / 4)
1976                 goto bad_packet;
1977         if (!pskb_may_pull(skb, th->doff * 4))
1978                 goto discard_it;
1979
1980         /* An explanation is required here, I think.
1981          * Packet length and doff are validated by header prediction,
1982          * provided case of th->doff==0 is eliminated.
1983          * So, we defer the checks. */
1984         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1985                 goto bad_packet;
1986
             /* Header pointers are fetched again after the pulls above
              * (presumably because pskb_may_pull() can move the data).
              */
1987         th = tcp_hdr(skb);
1988         iph = ip_hdr(skb);
1989         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
             /* SYN and FIN each take one unit of sequence space. */
1990         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1991                                     skb->len - th->doff * 4);
1992         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1993         TCP_SKB_CB(skb)->when    = 0;
1994         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1995         TCP_SKB_CB(skb)->sacked  = 0;
1996
1997         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1998         if (!sk)
1999                 goto no_tcp_socket;
2000
             /* Also re-entered from do_time_wait with sk replaced by a listener. */
2001 process:
2002         if (sk->sk_state == TCP_TIME_WAIT)
2003                 goto do_time_wait;
2004
2005         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2006                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2007                 goto discard_and_relse;
2008         }
2009
2010         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2011                 goto discard_and_relse;
2012         nf_reset(skb);
2013
2014         if (sk_filter(sk, skb))
2015                 goto discard_and_relse;
2016
2017         skb->dev = NULL;
2018
2019         bh_lock_sock_nested(sk);
2020         ret = 0;
             /* Socket not owned by a user context: process the segment now,
              * unless tcp_prequeue() accepts it.  Otherwise it goes to the
              * backlog, limited to sk_rcvbuf + sk_sndbuf.
              */
2021         if (!sock_owned_by_user(sk)) {
2022 #ifdef CONFIG_NET_DMA
2023                 struct tcp_sock *tp = tcp_sk(sk);
2024                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2025                         tp->ucopy.dma_chan = net_dma_find_channel();
2026                 if (tp->ucopy.dma_chan)
2027                         ret = tcp_v4_do_rcv(sk, skb);
2028                 else
2029 #endif
2030                 {
2031                         if (!tcp_prequeue(sk, skb))
2032                                 ret = tcp_v4_do_rcv(sk, skb);
2033                 }
2034         } else if (unlikely(sk_add_backlog(sk, skb,
2035                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
2036                 bh_unlock_sock(sk);
2037                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2038                 goto discard_and_relse;
2039         }
2040         bh_unlock_sock(sk);
2041
2042         sock_put(sk);
2043
2044         return ret;
2045
2046 no_tcp_socket:
2047         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2048                 goto discard_it;
2049
             /* bad_packet sits inside this if-body so that the header checks
              * at the top of the function share the error accounting; a
              * segment that passes both tests is answered with a reset.
              */
2050         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2051 bad_packet:
2052                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2053         } else {
2054                 tcp_v4_send_reset(NULL, skb);
2055         }
2056
2057 discard_it:
2058         /* Discard frame. */
2059         kfree_skb(skb);
2060         return 0;
2061
2062 discard_and_relse:
2063         sock_put(sk);
2064         goto discard_it;
2065
2066 do_time_wait:
2067         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2068                 inet_twsk_put(inet_twsk(sk));
2069                 goto discard_it;
2070         }
2071
2072         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2073                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2074                 inet_twsk_put(inet_twsk(sk));
2075                 goto discard_it;
2076         }
2077         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2078         case TCP_TW_SYN: {
2079                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2080                                                         &tcp_hashinfo,
2081                                                         iph->saddr, th->source,
2082                                                         iph->daddr, th->dest,
2083                                                         inet_iif(skb));
                     /* A listener exists: retire the TIME_WAIT socket and
                      * restart processing of this segment on the listener.
                      */
2084                 if (sk2) {
2085                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2086                         inet_twsk_put(inet_twsk(sk));
2087                         sk = sk2;
2088                         goto process;
2089                 }
2090                 /* Fall through to ACK */
2091         }
2092         case TCP_TW_ACK:
2093                 tcp_v4_timewait_ack(sk, skb);
2094                 break;
2095         case TCP_TW_RST:
2096                 goto no_tcp_socket;
2097         case TCP_TW_SUCCESS:;
2098         }
2099         goto discard_it;
2100 }
2101
2102 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2103         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2104         .twsk_unique    = tcp_twsk_unique,
2105         .twsk_destructor= tcp_twsk_destructor,
2106 };
2107
2108 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2109 {
2110         struct dst_entry *dst = skb_dst(skb);
2111
2112         dst_hold(dst);
2113         sk->sk_rx_dst = dst;
2114         inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2115 }
2116 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2117
2118 const struct inet_connection_sock_af_ops ipv4_specific = {
2119         .queue_xmit        = ip_queue_xmit,
2120         .send_check        = tcp_v4_send_check,
2121         .rebuild_header    = inet_sk_rebuild_header,
2122         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2123         .conn_request      = tcp_v4_conn_request,
2124         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2125         .net_header_len    = sizeof(struct iphdr),
2126         .setsockopt        = ip_setsockopt,
2127         .getsockopt        = ip_getsockopt,
2128         .addr2sockaddr     = inet_csk_addr2sockaddr,
2129         .sockaddr_len      = sizeof(struct sockaddr_in),
2130         .bind_conflict     = inet_csk_bind_conflict,
2131 #ifdef CONFIG_COMPAT
2132         .compat_setsockopt = compat_ip_setsockopt,
2133         .compat_getsockopt = compat_ip_getsockopt,
2134 #endif
2135 };
2136 EXPORT_SYMBOL(ipv4_specific);
2137
2138 #ifdef CONFIG_TCP_MD5SIG
2139 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2140         .md5_lookup             = tcp_v4_md5_lookup,
2141         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2142         .md5_parse              = tcp_v4_parse_md5_keys,
2143 };
2144 #endif
2145
2146 /* NOTE: A lot of things set to zero explicitly by call to
2147  *       sk_alloc() so need not be done here.
2148  */
2149 static int tcp_v4_init_sock(struct sock *sk)
2150 {
2151         struct inet_connection_sock *icsk = inet_csk(sk);
2152
2153         tcp_init_sock(sk);
2154
2155         icsk->icsk_af_ops = &ipv4_specific;
2156
2157 #ifdef CONFIG_TCP_MD5SIG
2158         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2159 #endif
2160
2161         return 0;
2162 }
2163
2164 void tcp_v4_destroy_sock(struct sock *sk)
2165 {
2166         struct tcp_sock *tp = tcp_sk(sk);
2167
2168         tcp_clear_xmit_timers(sk);
2169
2170         tcp_cleanup_congestion_control(sk);
2171
2172         /* Cleanup up the write buffer. */
2173         tcp_write_queue_purge(sk);
2174
2175         /* Cleans up our, hopefully empty, out_of_order_queue. */
2176         __skb_queue_purge(&tp->out_of_order_queue);
2177
2178 #ifdef CONFIG_TCP_MD5SIG
2179         /* Clean up the MD5 key list, if any */
2180         if (tp->md5sig_info) {
2181                 tcp_clear_md5_list(sk);
2182                 kfree_rcu(tp->md5sig_info, rcu);
2183                 tp->md5sig_info = NULL;
2184         }
2185 #endif
2186
2187 #ifdef CONFIG_NET_DMA
2188         /* Cleans up our sk_async_wait_queue */
2189         __skb_queue_purge(&sk->sk_async_wait_queue);
2190 #endif
2191
2192         /* Clean prequeue, it must be empty really */
2193         __skb_queue_purge(&tp->ucopy.prequeue);
2194
2195         /* Clean up a referenced TCP bind bucket. */
2196         if (inet_csk(sk)->icsk_bind_hash)
2197                 inet_put_port(sk);
2198
2199         /* TCP Cookie Transactions */
2200         if (tp->cookie_values != NULL) {
2201                 kref_put(&tp->cookie_values->kref,
2202                          tcp_cookie_values_release);
2203                 tp->cookie_values = NULL;
2204         }
2205         BUG_ON(tp->fastopen_rsk != NULL);
2206
2207         /* If socket is aborted during connect operation */
2208         tcp_free_fastopen_req(tp);
2209
2210         sk_sockets_allocated_dec(sk);
2211         sock_release_memcg(sk);
2212 }
2213 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2214
2215 #ifdef CONFIG_PROC_FS
2216 /* Proc filesystem TCP sock list dumping. */
2217
2218 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2219 {
2220         return hlist_nulls_empty(head) ? NULL :
2221                 list_entry(head->first, struct inet_timewait_sock, tw_node);
2222 }
2223
2224 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2225 {
2226         return !is_a_nulls(tw->tw_node.next) ?
2227                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2228 }
2229
2230 /*
2231  * Get next listener socket follow cur.  If cur is NULL, get first socket
2232  * starting from bucket given in st->bucket; when st->bucket is zero the
2233  * very first socket in the hash table is returned.
 *
 * The iterator yields two kinds of entries, told apart by st->state:
 * listening sockets (TCP_SEQ_STATE_LISTENING) and pending request socks
 * hanging off a listener's syn_table (TCP_SEQ_STATE_OPENREQ, with the
 * listener remembered in st->syn_wait_sk).  Only entries whose family
 * equals st->family are returned; sockets from another network namespace
 * are skipped.
 *
 * Locking as visible in this function: a non-NULL return leaves the
 * current listening hash bucket lock (ilb->lock) held, and while request
 * socks are being walked the listener's syn_wait_lock stays read-locked
 * as well.  A NULL return happens only after the last bucket's lock has
 * been dropped.  Releasing the locks after a non-NULL return is
 * presumably left to the seq_file stop path - confirm against the caller.
2234  */
2235 static void *listening_get_next(struct seq_file *seq, void *cur)
2236 {
2237         struct inet_connection_sock *icsk;
2238         struct hlist_nulls_node *node;
2239         struct sock *sk = cur;
2240         struct inet_listen_hashbucket *ilb;
2241         struct tcp_iter_state *st = seq->private;
2242         struct net *net = seq_file_net(seq);
2243
             /* Fresh start: lock st->bucket and begin at its head. */
2244         if (!sk) {
2245                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2246                 spin_lock_bh(&ilb->lock);
2247                 sk = sk_nulls_head(&ilb->head);
2248                 st->offset = 0;
2249                 goto get_sk;
2250         }
             /* Continuing: this bucket's lock was left held by the call that returned cur. */
2251         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2252         ++st->num;
2253         ++st->offset;
2254
2255         if (st->state == TCP_SEQ_STATE_OPENREQ) {
                     /* cur is a request sock of listener st->syn_wait_sk. */
2256                 struct request_sock *req = cur;
2257
2258                 icsk = inet_csk(st->syn_wait_sk);
2259                 req = req->dl_next;
2260                 while (1) {
                             /* Rest of the current syn_table chain. */
2261                         while (req) {
2262                                 if (req->rsk_ops->family == st->family) {
2263                                         cur = req;
2264                                         goto out;
2265                                 }
2266                                 req = req->dl_next;
2267                         }
2268                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2269                                 break;
                     /* Also entered from start_req below with st->sbucket reset to 0. */
2270 get_req:
2271                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2272                 }
                     /* Requests exhausted: move on to the next listener. */
2273                 sk        = sk_nulls_next(st->syn_wait_sk);
2274                 st->state = TCP_SEQ_STATE_LISTENING;
2275                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2276         } else {
                     /* cur is a listener: walk its pending requests, if any, first. */
2277                 icsk = inet_csk(sk);
2278                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2279                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2280                         goto start_req;
2281                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2282                 sk = sk_nulls_next(sk);
2283         }
2284 get_sk:
2285         sk_nulls_for_each_from(sk, node) {
2286                 if (!net_eq(sock_net(sk), net))
2287                         continue;
2288                 if (sk->sk_family == st->family) {
2289                         cur = sk;
2290                         goto out;
2291                 }
                     /* Listener of a different family: it is not returned
                      * itself, but its request queue is still scanned for
                      * requests whose family matches.
                      */
2292                 icsk = inet_csk(sk);
2293                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2294                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2295 start_req:
2296                         st->uid         = sock_i_uid(sk);
2297                         st->syn_wait_sk = sk;
2298                         st->state       = TCP_SEQ_STATE_OPENREQ;
2299                         st->sbucket     = 0;
2300                         goto get_req;
2301                 }
2302                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2303         }
             /* Bucket exhausted: unlock it and try the next one, if any. */
2304         spin_unlock_bh(&ilb->lock);
2305         st->offset = 0;
2306         if (++st->bucket < INET_LHTABLE_SIZE) {
2307                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2308                 spin_lock_bh(&ilb->lock);
2309                 sk = sk_nulls_head(&ilb->head);
2310                 goto get_sk;
2311         }
2312         cur = NULL;
2313 out:
2314         return cur;
2315 }
2316
2317 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2318 {
2319         struct tcp_iter_state *st = seq->private;
2320         void *rc;
2321
2322         st->bucket = 0;
2323         st->offset = 0;
2324         rc = listening_get_next(seq, NULL);
2325
2326         while (rc && *pos) {
2327                 rc = listening_get_next(seq, rc);
2328                 --*pos;
2329         }
2330         return rc;
2331 }
2332
2333 static inline bool empty_bucket(struct tcp_iter_state *st)
2334 {
2335         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2336                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2337 }
2338
2339 /*
2340  * Get first established socket starting from bucket given in st->bucket.
2341  * If st->bucket is zero, the very first socket in the hash is returned.
2342  */
2343 static void *established_get_first(struct seq_file *seq)
2344 {
2345         struct tcp_iter_state *st = seq->private;
2346         struct net *net = seq_file_net(seq);
2347         void *rc = NULL;
2348
2349         st->offset = 0;
2350         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2351                 struct sock *sk;
2352                 struct hlist_nulls_node *node;
2353                 struct inet_timewait_sock *tw;
2354                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2355
2356                 /* Lockless fast path for the common case of empty buckets */
2357                 if (empty_bucket(st))
2358                         continue;
2359
2360                 spin_lock_bh(lock);
2361                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2362                         if (sk->sk_family != st->family ||
2363                             !net_eq(sock_net(sk), net)) {
2364                                 continue;
2365                         }
2366                         rc = sk;
2367                         goto out;
2368                 }
2369                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2370                 inet_twsk_for_each(tw, node,
2371                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2372                         if (tw->tw_family != st->family ||
2373                             !net_eq(twsk_net(tw), net)) {
2374                                 continue;
2375                         }
2376                         rc = tw;
2377                         goto out;
2378                 }
2379                 spin_unlock_bh(lock);
2380                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2381         }
2382 out:
2383         return rc;
2384 }
2385
/*
 * Advance from @cur to the next matching entry of the established hash.
 *
 * @cur is interpreted according to st->state: as an inet_timewait_sock when
 * the state is TCP_SEQ_STATE_TIME_WAIT, as a sock otherwise.  st->num and
 * st->offset are incremented on every call.
 *
 * The lock of bucket st->bucket is released below when the bucket is
 * exhausted, so it is expected to be held on entry.  On a hit the lock of
 * the (possibly new) bucket stays held; NULL is returned with no lock held
 * once the bucket index passes ehash_mask.
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
        struct sock *sk = cur;
        struct inet_timewait_sock *tw;
        struct hlist_nulls_node *node;
        struct tcp_iter_state *st = seq->private;
        struct net *net = seq_file_net(seq);

        ++st->num;
        ++st->offset;

        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
                tw = cur;
                tw = tw_next(tw);
get_tw:
                /*
                 * Also entered by goto from the bottom of the function, with
                 * tw set to the head of the current bucket's time-wait chain.
                 * Skip entries of another family or network namespace.
                 */
                while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
                        tw = tw_next(tw);
                }
                if (tw) {
                        cur = tw;
                        goto out;
                }
                /* Time-wait chain exhausted: this bucket is finished. */
                spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
                st->state = TCP_SEQ_STATE_ESTABLISHED;

                /* Look for the next non-empty bucket (checked without the lock) */
                st->offset = 0;
                while (++st->bucket <= tcp_hashinfo.ehash_mask &&
                                empty_bucket(st))
                        ;
                if (st->bucket > tcp_hashinfo.ehash_mask)
                        return NULL;

                /* Lock the new bucket and restart from its established chain. */
                spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
                sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
        } else
                sk = sk_nulls_next(sk);

        /* Scan the rest of the established chain for a matching socket. */
        sk_nulls_for_each_from(sk, node) {
                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
                        goto found;
        }

        /* Nothing left on the established chain: try the time-wait chain. */
        st->state = TCP_SEQ_STATE_TIME_WAIT;
        tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
        goto get_tw;
found:
        cur = sk;
out:
        return cur;
}
2437
2438 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2439 {
2440         struct tcp_iter_state *st = seq->private;
2441         void *rc;
2442
2443         st->bucket = 0;
2444         rc = established_get_first(seq);
2445
2446         while (rc && pos) {
2447                 rc = established_get_next(seq, rc);
2448                 --pos;
2449         }
2450         return rc;
2451 }
2452
2453 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2454 {
2455         void *rc;
2456         struct tcp_iter_state *st = seq->private;
2457
2458         st->state = TCP_SEQ_STATE_LISTENING;
2459         rc        = listening_get_idx(seq, &pos);
2460
2461         if (!rc) {
2462                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2463                 rc        = established_get_idx(seq, pos);
2464         }
2465
2466         return rc;
2467 }
2468
/*
 * Try to resume the walk from the bucket and in-bucket offset saved in the
 * iterator state instead of rescanning from the start.
 *
 * Returns the entry found, or NULL when the saved position cannot be reached.
 * st->num is restored to its value on entry before returning, undoing any
 * changes made by the get_next helpers called here.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
        struct tcp_iter_state *st = seq->private;
        int offset = st->offset;
        int orig_num = st->num;
        void *rc = NULL;

        switch (st->state) {
        case TCP_SEQ_STATE_OPENREQ:
        case TCP_SEQ_STATE_LISTENING:
                if (st->bucket >= INET_LHTABLE_SIZE)
                        break;
                st->state = TCP_SEQ_STATE_LISTENING;
                rc = listening_get_next(seq, NULL);
                /* Step forward by the saved offset, stopping early on NULL. */
                while (offset-- && rc)
                        rc = listening_get_next(seq, rc);
                if (rc)
                        break;
                /*
                 * Listening walk came up empty: continue with the established
                 * hash from bucket 0.  Note that offset keeps whatever value
                 * the post-decrement loop above left in it.
                 */
                st->bucket = 0;
                /* Fallthrough */
        case TCP_SEQ_STATE_ESTABLISHED:
        case TCP_SEQ_STATE_TIME_WAIT:
                st->state = TCP_SEQ_STATE_ESTABLISHED;
                if (st->bucket > tcp_hashinfo.ehash_mask)
                        break;
                rc = established_get_first(seq);
                while (offset-- && rc)
                        rc = established_get_next(seq, rc);
        }

        st->num = orig_num;

        return rc;
}
2503
2504 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2505 {
2506         struct tcp_iter_state *st = seq->private;
2507         void *rc;
2508
2509         if (*pos && *pos == st->last_pos) {
2510                 rc = tcp_seek_last_pos(seq);
2511                 if (rc)
2512                         goto out;
2513         }
2514
2515         st->state = TCP_SEQ_STATE_LISTENING;
2516         st->num = 0;
2517         st->bucket = 0;
2518         st->offset = 0;
2519         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2520
2521 out:
2522         st->last_pos = *pos;
2523         return rc;
2524 }
2525
2526 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2527 {
2528         struct tcp_iter_state *st = seq->private;
2529         void *rc = NULL;
2530
2531         if (v == SEQ_START_TOKEN) {
2532                 rc = tcp_get_idx(seq, 0);
2533                 goto out;
2534         }
2535
2536         switch (st->state) {
2537         case TCP_SEQ_STATE_OPENREQ:
2538         case TCP_SEQ_STATE_LISTENING:
2539                 rc = listening_get_next(seq, v);
2540                 if (!rc) {
2541                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2542                         st->bucket = 0;
2543                         st->offset = 0;
2544                         rc        = established_get_first(seq);
2545                 }
2546                 break;
2547         case TCP_SEQ_STATE_ESTABLISHED:
2548         case TCP_SEQ_STATE_TIME_WAIT:
2549                 rc = established_get_next(seq, v);
2550                 break;
2551         }
2552 out:
2553         ++*pos;
2554         st->last_pos = *pos;
2555         return rc;
2556 }
2557
2558 static void tcp_seq_stop(struct seq_file *seq, void *v)
2559 {
2560         struct tcp_iter_state *st = seq->private;
2561
2562         switch (st->state) {
2563         case TCP_SEQ_STATE_OPENREQ:
2564                 if (v) {
2565                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2566                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2567                 }
2568         case TCP_SEQ_STATE_LISTENING:
2569                 if (v != SEQ_START_TOKEN)
2570                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2571                 break;
2572         case TCP_SEQ_STATE_TIME_WAIT:
2573         case TCP_SEQ_STATE_ESTABLISHED:
2574                 if (v)
2575                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2576                 break;
2577         }
2578 }
2579
2580 int tcp_seq_open(struct inode *inode, struct file *file)
2581 {
2582         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2583         struct tcp_iter_state *s;
2584         int err;
2585
2586         err = seq_open_net(inode, file, &afinfo->seq_ops,
2587                           sizeof(struct tcp_iter_state));
2588         if (err < 0)
2589                 return err;
2590
2591         s = ((struct seq_file *)file->private_data)->private;
2592         s->family               = afinfo->family;
2593         s->last_pos             = 0;
2594         return 0;
2595 }
2596 EXPORT_SYMBOL(tcp_seq_open);
2597
2598 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2599 {
2600         int rc = 0;
2601         struct proc_dir_entry *p;
2602
2603         afinfo->seq_ops.start           = tcp_seq_start;
2604         afinfo->seq_ops.next            = tcp_seq_next;
2605         afinfo->seq_ops.stop            = tcp_seq_stop;
2606
2607         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2608                              afinfo->seq_fops, afinfo);
2609         if (!p)
2610                 rc = -ENOMEM;
2611         return rc;
2612 }
2613 EXPORT_SYMBOL(tcp_proc_register);
2614
/*
 * Counterpart of tcp_proc_register(): calls proc_net_remove() for the entry
 * named afinfo->name in @net.
 */
void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
        proc_net_remove(net, afinfo->name);
}
EXPORT_SYMBOL(tcp_proc_unregister);
2620
2621 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2622                          struct seq_file *f, int i, kuid_t uid, int *len)
2623 {
2624         const struct inet_request_sock *ireq = inet_rsk(req);
2625         long delta = req->expires - jiffies;
2626
2627         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2628                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2629                 i,
2630                 ireq->loc_addr,
2631                 ntohs(inet_sk(sk)->inet_sport),
2632                 ireq->rmt_addr,
2633                 ntohs(ireq->rmt_port),
2634                 TCP_SYN_RECV,
2635                 0, 0, /* could print option size, but that is af dependent. */
2636                 1,    /* timers active (only the expire timer) */
2637                 jiffies_delta_to_clock_t(delta),
2638                 req->num_timeout,
2639                 from_kuid_munged(seq_user_ns(f), uid),
2640                 0,  /* non standard timer */
2641                 0, /* open_requests have no inode */
2642                 atomic_read(&sk->sk_refcnt),
2643                 req,
2644                 len);
2645 }
2646
/*
 * Print one table row for the full socket @sk.  @i is the row number; the
 * number of characters produced is stored through @len by the trailing %n.
 * The socket is not locked here, so the values read are a best-effort
 * snapshot.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
        int timer_active;
        unsigned long timer_expires;
        const struct tcp_sock *tp = tcp_sk(sk);
        const struct inet_connection_sock *icsk = inet_csk(sk);
        const struct inet_sock *inet = inet_sk(sk);
        struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
        __be32 dest = inet->inet_daddr;
        __be32 src = inet->inet_rcv_saddr;
        __u16 destp = ntohs(inet->inet_dport);
        __u16 srcp = ntohs(inet->inet_sport);
        int rx_queue;

        /*
         * Timer column code: 1 = ICSK_TIME_RETRANS pending, 4 =
         * ICSK_TIME_PROBE0 pending, 2 = sk_timer pending, 0 = none of these
         * (expiry is then reported relative to "now", i.e. as zero).
         */
        if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
                timer_active    = 1;
                timer_expires   = icsk->icsk_timeout;
        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
                timer_active    = 4;
                timer_expires   = icsk->icsk_timeout;
        } else if (timer_pending(&sk->sk_timer)) {
                timer_active    = 2;
                timer_expires   = sk->sk_timer.expires;
        } else {
                timer_active    = 0;
                timer_expires = jiffies;
        }

        /* Listeners report their accept backlog in the rx_queue column. */
        if (sk->sk_state == TCP_LISTEN)
                rx_queue = sk->sk_ack_backlog;
        else
                /*
                 * Because the socket is not locked, the difference may be
                 * transiently negative; clamp it to zero.
                 */
                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

        /*
         * The last column is the fastopen queue limit (0 without a fastopen
         * queue) for listeners; otherwise it is snd_ssthresh, or -1 while
         * tcp_in_initial_slowstart() holds.
         */
        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
                        "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
                i, src, srcp, dest, destp, sk->sk_state,
                tp->write_seq - tp->snd_una,
                rx_queue,
                timer_active,
                jiffies_delta_to_clock_t(timer_expires - jiffies),
                icsk->icsk_retransmits,
                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
                icsk->icsk_probes_out,
                sock_i_ino(sk),
                atomic_read(&sk->sk_refcnt), sk,
                jiffies_to_clock_t(icsk->icsk_rto),
                jiffies_to_clock_t(icsk->icsk_ack.ato),
                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
                tp->snd_cwnd,
                sk->sk_state == TCP_LISTEN ?
                    (fastopenq ? fastopenq->max_qlen : 0) :
                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
                len);
}
2704
2705 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2706                                struct seq_file *f, int i, int *len)
2707 {
2708         __be32 dest, src;
2709         __u16 destp, srcp;
2710         long delta = tw->tw_ttd - jiffies;
2711
2712         dest  = tw->tw_daddr;
2713         src   = tw->tw_rcv_saddr;
2714         destp = ntohs(tw->tw_dport);
2715         srcp  = ntohs(tw->tw_sport);
2716
2717         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2718                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2719                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2720                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2721                 atomic_read(&tw->tw_refcnt), tw, len);
2722 }
2723
2724 #define TMPSZ 150
2725
2726 static int tcp4_seq_show(struct seq_file *seq, void *v)
2727 {
2728         struct tcp_iter_state *st;
2729         int len;
2730
2731         if (v == SEQ_START_TOKEN) {
2732                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2733                            "  sl  local_address rem_address   st tx_queue "
2734                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2735                            "inode");
2736                 goto out;
2737         }
2738         st = seq->private;
2739
2740         switch (st->state) {
2741         case TCP_SEQ_STATE_LISTENING:
2742         case TCP_SEQ_STATE_ESTABLISHED:
2743                 get_tcp4_sock(v, seq, st->num, &len);
2744                 break;
2745         case TCP_SEQ_STATE_OPENREQ:
2746                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2747                 break;
2748         case TCP_SEQ_STATE_TIME_WAIT:
2749                 get_timewait4_sock(v, seq, st->num, &len);
2750                 break;
2751         }
2752         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2753 out:
2754         return 0;
2755 }
2756
/*
 * File operations for the TCP proc file: tcp_seq_open() for open, the
 * generic seq_file helpers for read/llseek, seq_release_net() for release.
 * Referenced from tcp4_seq_afinfo.seq_fops.
 */
static const struct file_operations tcp_afinfo_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = tcp_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net
};
2764
/*
 * Description of the AF_INET TCP proc entry, named "tcp".  Only ->show is
 * set here; the start/next/stop callbacks of seq_ops are filled in by
 * tcp_proc_register().
 */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
        .name           = "tcp",
        .family         = AF_INET,
        .seq_fops       = &tcp_afinfo_seq_fops,
        .seq_ops        = {
                .show           = tcp4_seq_show,
        },
};
2773
/* Per-net init hook: register the IPv4 TCP proc entry for @net. */
static int __net_init tcp4_proc_init_net(struct net *net)
{
        return tcp_proc_register(net, &tcp4_seq_afinfo);
}
2778
/* Per-net exit hook: remove the IPv4 TCP proc entry from @net. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
        tcp_proc_unregister(net, &tcp4_seq_afinfo);
}
2783
/* Pernet operations pairing the proc init/exit hooks defined above. */
static struct pernet_operations tcp4_net_ops = {
        .init = tcp4_proc_init_net,
        .exit = tcp4_proc_exit_net,
};
2788
/*
 * Register tcp4_net_ops as a pernet subsystem.  Returns the result of
 * register_pernet_subsys().
 */
int __init tcp4_proc_init(void)
{
        return register_pernet_subsys(&tcp4_net_ops);
}
2793
/* Undo tcp4_proc_init(): unregister tcp4_net_ops. */
void tcp4_proc_exit(void)
{
        unregister_pernet_subsys(&tcp4_net_ops);
}
2798 #endif /* CONFIG_PROC_FS */
2799
2800 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2801 {
2802         const struct iphdr *iph = skb_gro_network_header(skb);
2803         __wsum wsum;
2804         __sum16 sum;
2805
2806         switch (skb->ip_summed) {
2807         case CHECKSUM_COMPLETE:
2808                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2809                                   skb->csum)) {
2810                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2811                         break;
2812                 }
2813 flush:
2814                 NAPI_GRO_CB(skb)->flush = 1;
2815                 return NULL;
2816
2817         case CHECKSUM_NONE:
2818                 wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2819                                           skb_gro_len(skb), IPPROTO_TCP, 0);
2820                 sum = csum_fold(skb_checksum(skb,
2821                                              skb_gro_offset(skb),
2822                                              skb_gro_len(skb),
2823                                              wsum));
2824                 if (sum)
2825                         goto flush;
2826
2827                 skb->ip_summed = CHECKSUM_UNNECESSARY;
2828                 break;
2829         }
2830
2831         return tcp_gro_receive(head, skb);
2832 }
2833
2834 int tcp4_gro_complete(struct sk_buff *skb)
2835 {
2836         const struct iphdr *iph = ip_hdr(skb);
2837         struct tcphdr *th = tcp_hdr(skb);
2838
2839         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2840                                   iph->saddr, iph->daddr, 0);
2841         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2842
2843         return tcp_gro_complete(skb);
2844 }
2845
/*
 * Protocol operations table for TCP over IPv4, exported for use outside
 * this file.  It wires the socket-level entry points to their TCP
 * implementations and carries the protocol-wide accounting pointers, sysctl
 * arrays and object sizes.
 */
struct proto tcp_prot = {
        .name                   = "TCP",
        .owner                  = THIS_MODULE,
        .close                  = tcp_close,
        .connect                = tcp_v4_connect,
        .disconnect             = tcp_disconnect,
        .accept                 = inet_csk_accept,
        .ioctl                  = tcp_ioctl,
        .init                   = tcp_v4_init_sock,
        .destroy                = tcp_v4_destroy_sock,
        .shutdown               = tcp_shutdown,
        .setsockopt             = tcp_setsockopt,
        .getsockopt             = tcp_getsockopt,
        .recvmsg                = tcp_recvmsg,
        .sendmsg                = tcp_sendmsg,
        .sendpage               = tcp_sendpage,
        .backlog_rcv            = tcp_v4_do_rcv,
        .release_cb             = tcp_release_cb,
        .mtu_reduced            = tcp_v4_mtu_reduced,
        .hash                   = inet_hash,
        .unhash                 = inet_unhash,
        .get_port               = inet_csk_get_port,
        .enter_memory_pressure  = tcp_enter_memory_pressure,
        .sockets_allocated      = &tcp_sockets_allocated,
        .orphan_count           = &tcp_orphan_count,
        .memory_allocated       = &tcp_memory_allocated,
        .memory_pressure        = &tcp_memory_pressure,
        .sysctl_wmem            = sysctl_tcp_wmem,
        .sysctl_rmem            = sysctl_tcp_rmem,
        .max_header             = MAX_TCP_HEADER,
        .obj_size               = sizeof(struct tcp_sock),
        .slab_flags             = SLAB_DESTROY_BY_RCU,
        .twsk_prot              = &tcp_timewait_sock_ops,
        .rsk_prot               = &tcp_request_sock_ops,
        .h.hashinfo             = &tcp_hashinfo,
        .no_autobind            = true,
        /* Compat socket-option handlers, only with CONFIG_COMPAT. */
#ifdef CONFIG_COMPAT
        .compat_setsockopt      = compat_tcp_setsockopt,
        .compat_getsockopt      = compat_tcp_getsockopt,
#endif
        /* cgroup hooks, only with CONFIG_MEMCG_KMEM. */
#ifdef CONFIG_MEMCG_KMEM
        .init_cgroup            = tcp_init_cgroup,
        .destroy_cgroup         = tcp_destroy_cgroup,
        .proto_cgroup           = tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);
2893
/*
 * Per-net init hook: set this namespace's sysctl_tcp_ecn to 2.
 * Always returns 0.
 */
static int __net_init tcp_sk_init(struct net *net)
{
        net->ipv4.sysctl_tcp_ecn = 2;
        return 0;
}
2899
/* Per-net exit hook: intentionally empty, nothing to undo per namespace. */
static void __net_exit tcp_sk_exit(struct net *net)
{
}
2903
/*
 * Batched per-net exit hook: calls inet_twsk_purge() on tcp_hashinfo and
 * tcp_death_row for AF_INET.  @net_exit_list is not used here.
 */
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
        inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}
2908
/* Pernet operations for TCP: per-net init/exit plus the batched exit hook. */
static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init       = tcp_sk_init,
       .exit       = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};
2914
/*
 * Boot-time initialisation: set up tcp_hashinfo and register tcp_sk_ops as
 * a pernet subsystem.  A registration failure is fatal (panic).
 *
 * NOTE(review): the panic text refers to a control socket, but the only
 * step that can fail here is register_pernet_subsys() - confirm whether the
 * message is a leftover.
 */
void __init tcp_v4_init(void)
{
        inet_hashinfo_init(&tcp_hashinfo);
        if (register_pernet_subsys(&tcp_sk_ops))
                panic("Failed to create the TCP control socket.\n");
}