/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

int sysctl_tcp_syncookies __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_syncookies);

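/* If set, send a RST instead of silently dropping the segment when the
 * accept queue of a listening socket overflows; see the listen_overflow
 * handling in tcp_check_req() below.
 */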
int sysctl_tcp_abort_on_overflow __read_mostly;

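/* Accounting and reaper machinery for sockets in TIME-WAIT state: the
 * bucket limit, the slow and short-time (calendar) kill timers, and the
 * hash table the buckets live in.
 */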
struct inet_timewait_death_row tcp_death_row = {
	.sysctl_max_tw_buckets = NR_FILE * 2,
	.period		= TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
	.death_lock	= __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
	.hashinfo	= &tcp_hashinfo,
	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
					    (unsigned long)&tcp_death_row),
	.twkill_work	= __WORK_INITIALIZER(tcp_death_row.twkill_work,
					     inet_twdr_twkill_work),
/* Short-time timewait calendar */

	.twcal_hand	= -1,
	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
					    (unsigned long)&tcp_death_row),
};
EXPORT_SYMBOL_GPL(tcp_death_row);

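/* Does the received segment [seq, end_seq] overlap the advertised
 * receive window [s_win, e_win]?  The first and last tests handle
 * zero-length segments and zero windows.
 */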
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
	if (seq == s_win)
		return true;
	if (after(end_seq, s_win) && before(seq, e_win))
		return true;
	return seq == e_win && seq == end_seq;
}

static enum tcp_tw_status
tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
				  const struct sk_buff *skb, int mib_idx)
{
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

	if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
				  &tcptw->tw_last_oow_ack_time)) {
		/* Send ACK. Note, we do not put the bucket,
		 * it will be released by caller.
		 */
		return TCP_TW_ACK;
	}

	/* We are rate-limiting, so just release the tw sock and drop skb. */
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}

/*
 * * The main purpose of TIME-WAIT state is to close the connection
 *   gracefully when one of the ends sits in LAST-ACK or CLOSING
 *   retransmitting FIN (and, probably, a tail of data) and one or more
 *   of our ACKs are lost.
 * * What is the TIME-WAIT timeout? It is associated with maximal packet
 *   lifetime in the internet, which leads to the wrong conclusion that
 *   it is set to catch "old duplicate segments" wandering out of their path.
 *   That is not quite correct. This timeout is calculated so that it exceeds
 *   the maximal retransmission timeout enough to allow losing one (or more)
 *   segments sent by the peer and our ACKs. This time may be calculated from RTO.
 * * When a TIME-WAIT socket receives RST, it means that the other end
 *   finally closed and we are allowed to kill TIME-WAIT too.
 * * The second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 *   with these semantics, we MUST NOT kill TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates
 *   (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
 * When you compare it to the RFCs, please read section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
 * is _not_ stateless. It means that, strictly speaking, we must
 * spinlock it. I do not want to! Well, the probability of misbehaviour
 * is ridiculously low and, it seems, we could use some mb() tricks
 * to avoid misreading sequence numbers, states etc.  --ANK
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
 */
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
			   const struct tcphdr *th)
{
	struct tcp_options_received tmp_opt;
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
	bool paws_reject = false;

	tmp_opt.saw_tstamp = 0;
	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
		tcp_parse_options(skb, &tmp_opt, 0, NULL);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
			tmp_opt.ts_recent = tcptw->tw_ts_recent;
			tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

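	/* A socket in FIN-WAIT-2 whose owner has already gone away is
	 * represented by a timewait bucket with tw_substate == TCP_FIN_WAIT2,
	 * so the relevant tcp_rcv_state_process() checks are repeated here.
	 */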
	if (tw->tw_substate == TCP_FIN_WAIT2) {
		/* Just repeat all the checks of tcp_rcv_state_process() */

		/* Out of window, send ACK */
		if (paws_reject ||
		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   tcptw->tw_rcv_nxt,
				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
			return tcp_timewait_check_oow_rate_limit(
				tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);

		if (th->rst)
			goto kill;

		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
			goto kill_with_rst;

		/* Dup ACK? */
		if (!th->ack ||
		    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
			inet_twsk_put(tw);
			return TCP_TW_SUCCESS;
		}

		/* New data or FIN. If new data arrive after half-duplex close,
		 * reset.
		 */
		if (!th->fin ||
		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
			inet_twsk_deschedule(tw, &tcp_death_row);
			inet_twsk_put(tw);
			return TCP_TW_RST;
		}

		/* FIN arrived, enter true time-wait state. */
		tw->tw_substate = TCP_TIME_WAIT;
		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent_stamp = get_seconds();
			tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
		}

		if (tcp_death_row.sysctl_tw_recycle &&
		    tcptw->tw_ts_recent_stamp &&
		    tcp_tw_remember_stamp(tw))
			inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
					   TCP_TIMEWAIT_LEN);
		else
			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
					   TCP_TIMEWAIT_LEN);
		return TCP_TW_ACK;
	}

	/*
	 * Now real TIME-WAIT state.
	 *
	 * RFC 1122:
	 * "When a connection is [...] on TIME-WAIT state [...]
	 * [a TCP] MAY accept a new SYN from the remote TCP to
	 * reopen the connection directly, if it:
	 *
	 * (1)  assigns its initial sequence number for the new
	 *      connection to be larger than the largest sequence
	 *      number it used on the previous connection incarnation,
	 *      and
	 *
	 * (2)  returns to TIME-WAIT state if the SYN turns out
	 *      to be an old duplicate".
	 */

	if (!paws_reject &&
	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
		/* In window segment, it may be only reset or bare ack. */

		if (th->rst) {
			/* This is TIME_WAIT assassination, in two flavors.
			 * Oh well... nobody has a sufficient solution to this
			 * protocol bug yet.
			 */
			if (sysctl_tcp_rfc1337 == 0) {
kill:
				inet_twsk_deschedule(tw, &tcp_death_row);
				inet_twsk_put(tw);
				return TCP_TW_SUCCESS;
			}
		}
		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
				   TCP_TIMEWAIT_LEN);

		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
			tcptw->tw_ts_recent_stamp = get_seconds();
		}

		inet_twsk_put(tw);
		return TCP_TW_SUCCESS;
	}

	/* Out of window segment.
	 *
	 * All the segments are ACKed immediately.
	 *
	 * The only exception is a new SYN. We accept it, if it is
	 * not an old duplicate and we are not in danger of being killed
	 * by delayed old duplicates. The RFC check, that it has a newer
	 * sequence number, works at rates <40Mbit/sec.
	 * However, if PAWS works, it is reliable AND even more,
	 * we may even relax the silly seq space cutoff.
	 *
	 * RED-PEN: we violate the main RFC requirement: if this SYN turns
	 * out to be an old duplicate (i.e. we receive RST in reply to
	 * SYN-ACK), we must return the socket to time-wait state. It is
	 * not good, but not fatal yet.
	 */

	if (th->syn && !th->rst && !th->ack && !paws_reject &&
	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
	     (tmp_opt.saw_tstamp &&
	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
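		/* Pick the new ISN comfortably above anything the previous
		 * incarnation could have sent: old snd_nxt plus the maximum
		 * unscaled window, per the RFC 1122 rule quoted above.
		 */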
		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;

		if (isn == 0)
			isn++;
		TCP_SKB_CB(skb)->tcp_tw_isn = isn;
		return TCP_TW_SYN;
	}

	if (paws_reject)
		NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);

	if (!th->rst) {
		/* In this case we must reset the TIMEWAIT timer.
		 *
		 * If it is an ACKless SYN it may be both an old duplicate
		 * and a new good SYN with random sequence number <rcv_nxt.
		 * Do not reschedule in the latter case.
		 */
		if (paws_reject || th->ack)
			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
					   TCP_TIMEWAIT_LEN);

		return tcp_timewait_check_oow_rate_limit(
			tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
	}
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}
EXPORT_SYMBOL(tcp_timewait_state_process);

/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
	struct inet_timewait_sock *tw = NULL;
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_sock *tp = tcp_sk(sk);
	bool recycle_ok = false;

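	/* With tw_recycle, remember per-destination timestamp state so the
	 * TIME-WAIT interval can be shortened (see the recycle_ok handling
	 * below).
	 */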
	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
		recycle_ok = tcp_remember_stamp(sk);

	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
		tw = inet_twsk_alloc(sk, state);

	if (tw != NULL) {
		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);	/* 3.5 * RTO */
		struct inet_sock *inet = inet_sk(sk);

		tw->tw_transparent = inet->transparent;
		tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
		tcptw->tw_rcv_nxt = tp->rcv_nxt;
		tcptw->tw_snd_nxt = tp->snd_nxt;
		tcptw->tw_rcv_wnd = tcp_receive_window(tp);
		tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
		tcptw->tw_ts_offset = tp->tsoffset;
		tcptw->tw_last_oow_ack_time = 0;

#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == PF_INET6) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			tw->tw_v6_daddr = sk->sk_v6_daddr;
			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
			tw->tw_tclass = np->tclass;
			tw->tw_flowlabel = np->flow_label >> 12;
			tw->tw_ipv6only = sk->sk_ipv6only;
		}
#endif

#ifdef CONFIG_TCP_MD5SIG
		/*
		 * The timewait bucket does not have the key DB from the
		 * sock structure. We just make a quick copy of the
		 * md5 key being used (if indeed we are using one)
		 * so the timewait ack generating code has the key.
		 */
		do {
			struct tcp_md5sig_key *key;

			tcptw->tw_md5_key = NULL;
			key = tp->af_specific->md5_lookup(sk, sk);
			if (key != NULL) {
				tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
				if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool())
					BUG();
			}
		} while (0);
#endif

		/* Linkage updates. */
		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);

		/* Get the TIME_WAIT timeout firing. */
		if (timeo < rto)
			timeo = rto;

		if (recycle_ok) {
			tw->tw_timeout = rto;
		} else {
			tw->tw_timeout = TCP_TIMEWAIT_LEN;
			if (state == TCP_TIME_WAIT)
				timeo = TCP_TIMEWAIT_LEN;
		}

		inet_twsk_schedule(tw, &tcp_death_row, timeo,
				   TCP_TIMEWAIT_LEN);
		inet_twsk_put(tw);
	} else {
		/* Sorry, if we're out of memory, just CLOSE this
		 * socket up.  We've got bigger problems than
		 * non-graceful socket closings.
		 */
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
	}

	tcp_update_metrics(sk);
	tcp_done(sk);
}

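/* The copied MD5 key is freed via kfree_rcu() so that any concurrent
 * RCU reader still looking at it sees valid memory until its grace
 * period ends.
 */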
void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_timewait_sock *twsk = tcp_twsk(sk);

	if (twsk->tw_md5_key)
		kfree_rcu(twsk->tw_md5_key, rcu);
#endif
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);

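/* Choose the initial receive window, window clamp and window scale
 * advertised on the SYN-ACK for a request sock, bounded by the user's
 * receive buffer limits when SO_RCVBUF is locked.
 */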
void tcp_openreq_init_rwin(struct request_sock *req,
			   struct sock *sk, struct dst_entry *dst)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct tcp_sock *tp = tcp_sk(sk);
	__u8 rcv_wscale;
	int mss = dst_metric_advmss(dst);

	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
		mss = tp->rx_opt.user_mss;

	/* Set this up on the first call only */
	req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);

	/* limit the window selection if the user enforces a smaller rx buffer */
	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
	    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
		req->window_clamp = tcp_full_space(sk);

	/* tcp_full_space because it is guaranteed to be the first packet */
	tcp_select_initial_window(tcp_full_space(sk),
		mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
		&req->rcv_wnd,
		&req->window_clamp,
		ireq->wscale_ok,
		&rcv_wscale,
		dst_metric(dst, RTAX_INITRWND));
	ireq->rcv_wscale = rcv_wscale;
}
EXPORT_SYMBOL(tcp_openreq_init_rwin);

static void tcp_ecn_openreq_child(struct tcp_sock *tp,
				  const struct request_sock *req)
{
	tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
}

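/* Pick the congestion control module for the child socket: prefer an
 * algorithm cached in the route's RTAX_CC_ALGO metric, and fall back to
 * the listener's (or the system default) when it is unavailable.
 */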
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
	bool ca_got_dst = false;

	if (ca_key != TCP_CA_UNSPEC) {
		const struct tcp_congestion_ops *ca;

		rcu_read_lock();
		ca = tcp_ca_find_key(ca_key);
		if (likely(ca && try_module_get(ca->owner))) {
			icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
			icsk->icsk_ca_ops = ca;
			ca_got_dst = true;
		}
		rcu_read_unlock();
	}

	if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner))
		tcp_assign_congestion_control(sk);

	tcp_set_ca_state(sk, TCP_CA_Open);
}
EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);

447/* This is not only more efficient than what we used to do, it eliminates
448 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
449 *
450 * Actually, we could lots of memory writes here. tp of listening
451 * socket contains all necessary default parameters.
452 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);

	if (newsk != NULL) {
		const struct inet_request_sock *ireq = inet_rsk(req);
		struct tcp_request_sock *treq = tcp_rsk(req);
		struct inet_connection_sock *newicsk = inet_csk(newsk);
		struct tcp_sock *newtp = tcp_sk(newsk);

		/* Now setup tcp_sock */
		newtp->pred_flags = 0;

		newtp->rcv_wup = newtp->copied_seq =
		newtp->rcv_nxt = treq->rcv_isn + 1;

		newtp->snd_sml = newtp->snd_una =
		newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;

		tcp_prequeue_init(newtp);
		INIT_LIST_HEAD(&newtp->tsq_node);

		tcp_init_wl(newtp, treq->rcv_isn);

		newtp->srtt_us = 0;
		newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
		newicsk->icsk_rto = TCP_TIMEOUT_INIT;

		newtp->packets_out = 0;
		newtp->retrans_out = 0;
		newtp->sacked_out = 0;
		newtp->fackets_out = 0;
		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
		tcp_enable_early_retrans(newtp);
		newtp->tlp_high_seq = 0;
		newtp->lsndtime = treq->snt_synack;
		newtp->last_oow_ack_time = 0;
		newtp->total_retrans = req->num_retrans;

		/* So many TCP implementations out there (incorrectly) count the
		 * initial SYN frame in their delayed-ACK and congestion control
		 * algorithms that we must have the following bandaid to talk
		 * efficiently to them.  -DaveM
		 */
		newtp->snd_cwnd = TCP_INIT_CWND;
		newtp->snd_cwnd_cnt = 0;

		tcp_init_xmit_timers(newsk);
		__skb_queue_head_init(&newtp->out_of_order_queue);
		newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;

		newtp->rx_opt.saw_tstamp = 0;

		newtp->rx_opt.dsack = 0;
		newtp->rx_opt.num_sacks = 0;

		newtp->urg_data = 0;

		if (sock_flag(newsk, SOCK_KEEPOPEN))
			inet_csk_reset_keepalive_timer(newsk,
						       keepalive_time_when(newtp));

		newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
		if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
			if (sysctl_tcp_fack)
				tcp_enable_fack(newtp);
		}
		newtp->window_clamp = req->window_clamp;
		newtp->rcv_ssthresh = req->rcv_wnd;
		newtp->rcv_wnd = req->rcv_wnd;
		newtp->rx_opt.wscale_ok = ireq->wscale_ok;
		if (newtp->rx_opt.wscale_ok) {
			newtp->rx_opt.snd_wscale = ireq->snd_wscale;
			newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
		} else {
			newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
			/* Without window scaling the window field is 16 bits */
			newtp->window_clamp = min(newtp->window_clamp, 65535U);
		}
		newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
				  newtp->rx_opt.snd_wscale);
		newtp->max_window = newtp->snd_wnd;

		if (newtp->rx_opt.tstamp_ok) {
			newtp->rx_opt.ts_recent = req->ts_recent;
			newtp->rx_opt.ts_recent_stamp = get_seconds();
			newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		} else {
			newtp->rx_opt.ts_recent_stamp = 0;
			newtp->tcp_header_len = sizeof(struct tcphdr);
		}
		newtp->tsoffset = 0;
#ifdef CONFIG_TCP_MD5SIG
		newtp->md5sig_info = NULL;	/*XXX*/
		if (newtp->af_specific->md5_lookup(sk, newsk))
			newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
		if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
		newtp->rx_opt.mss_clamp = req->mss;
		tcp_ecn_openreq_child(newtp, req);
		newtp->fastopen_rsk = NULL;
		newtp->syn_data_acked = 0;

		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
	}
	return newsk;
}
EXPORT_SYMBOL(tcp_create_openreq_child);

/*
 * Process an incoming packet for SYN_RECV sockets represented as a
 * request_sock. Normally sk is the listener socket but for TFO it
 * points to the child socket.
 *
 * XXX (TFO) - The current impl contains a special check for ack
 * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
 */

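/* Returns the child socket once the handshake completes, NULL when the
 * segment was consumed or dropped, or sk itself when the listener should
 * process (and, for an invalid ACK, reset) the segment.
 */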
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req,
			   struct request_sock **prev,
			   bool fastopen)
{
	struct tcp_options_received tmp_opt;
	struct sock *child;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	bool paws_reject = false;

	BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));

	tmp_opt.saw_tstamp = 0;
	if (th->doff > (sizeof(struct tcphdr) >> 2)) {
		tcp_parse_options(skb, &tmp_opt, 0, NULL);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent = req->ts_recent;
			/* We do not store the true stamp, but it is not
			 * required; it can be estimated (approximately)
			 * from other data.
			 */
			tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ) << req->num_timeout);
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

	/* Check for pure retransmitted SYN. */
	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
	    flg == TCP_FLAG_SYN &&
	    !paws_reject) {
		/*
		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
		 * this case on figure 6 and figure 8, but the formal
		 * protocol description says NOTHING.
		 * To be more exact, it says that we should send ACK,
		 * because this segment (at least, if it has no data)
		 * is out of window.
		 *
		 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
		 * describe SYN-RECV state. All the description
		 * is wrong, we cannot believe it and should
		 * rely only on common sense and implementation
		 * experience.
		 *
		 * Enforce "SYN-ACK" according to figure 8, figure 6
		 * of RFC793, fixed by RFC1122.
		 *
		 * Note that even if there is new data in the SYN packet
		 * it will be thrown away too.
		 *
		 * Reset the timer after retransmitting the SYNACK, similar to
		 * the idea of fast retransmit in recovery.
		 */
		if (!tcp_oow_rate_limited(sock_net(sk), skb,
					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
					  &tcp_rsk(req)->last_oow_ack_time) &&
		    !inet_rtx_syn_ack(sk, req))
			req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
					   TCP_RTO_MAX) + jiffies;
		return NULL;
	}

	/* Further reproduces section "SEGMENT ARRIVES"
	 * for state SYN-RECEIVED of RFC793.
	 * It is broken, however: it fails only when SYNs are crossed.
	 *
	 * You would think that SYN crossing is impossible here, since
	 * we should have a SYN_SENT socket (from connect()) on our end,
	 * but this is not true if the crossed SYNs were sent to both
	 * ends by a malicious third party. We must defend against this,
	 * and to do that we first verify the ACK (as per RFC793, page
	 * 36) and reset if it is invalid. Is this a true full defense?
	 * To convince ourselves, let us consider a way in which the ACK
	 * test can still pass in this 'malicious crossed SYNs' case.
	 * Malicious sender sends identical SYNs (and thus identical sequence
	 * numbers) to both A and B:
	 *
	 * A: gets SYN, seq=7
	 * B: gets SYN, seq=7
	 *
	 * By our good fortune, both A and B select the same initial
	 * send sequence number of seven :-)
	 *
	 * A: sends SYN|ACK, seq=7, ack_seq=8
	 * B: sends SYN|ACK, seq=7, ack_seq=8
	 *
	 * So we are now A eating this SYN|ACK, ACK test passes. So
	 * does sequence test, SYN is truncated, and thus we consider
	 * it a bare ACK.
	 *
	 * If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
	 * bare ACK. Otherwise, we create an established connection. Both
	 * ends (listening sockets) accept the new incoming connection and try
	 * to talk to each other. 8-)
	 *
	 * Note: This case is both harmless, and rare. The possibility is
	 * about the same as us discovering intelligent life on another
	 * planet tomorrow.
	 *
	 * But generally, we should (the RFC lies!) accept an ACK of our
	 * SYNACK both here and in tcp_rcv_state_process().
	 * tcp_rcv_state_process() does not, hence we do not either.
	 *
	 * Note that the case is absolutely generic:
	 * we cannot optimize anything here without
	 * violating protocol. All the checks must be made
	 * before an attempt to create a socket.
	 */

	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
	 * and the incoming segment acknowledges something not yet
	 * sent (the segment carries an unacceptable ACK) ...
	 * a reset is sent."
	 *
	 * Invalid ACK: reset will be sent by listening socket.
	 * Note that the ACK validity check for a Fast Open socket is done
	 * elsewhere and is checked directly against the child socket rather
	 * than req because user data may have been sent out.
	 */
	if ((flg & TCP_FLAG_ACK) && !fastopen &&
	    (TCP_SKB_CB(skb)->ack_seq !=
	     tcp_rsk(req)->snt_isn + 1))
		return sk;

	/* Also, it would not be such a bad idea to check rcv_tsecr, which
	 * is essentially an ACK extension; too early or too late values
	 * should cause a reset in unsynchronized states.
	 */

	/* RFC793: "first check sequence number". */

	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
					  tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
		/* Out of window: send ACK and drop. */
		if (!(flg & TCP_FLAG_RST))
			req->rsk_ops->send_ack(sk, skb, req);
		if (paws_reject)
			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
		return NULL;
	}

	/* In sequence, PAWS is OK. */

	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
		req->ts_recent = tmp_opt.rcv_tsval;

	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
		/* Truncate SYN, it is out of window starting
		 * at tcp_rsk(req)->rcv_isn + 1.
		 */
		flg &= ~TCP_FLAG_SYN;
	}

	/* RFC793: "second check the RST bit" and
	 *	   "fourth, check the SYN bit"
	 */
	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
		goto embryonic_reset;
	}

	/* ACK sequence verified above, just make sure ACK is
	 * set.  If ACK not set, just silently drop the packet.
	 *
	 * XXX (TFO) - if we ever allow "data after SYN", the
	 * following check needs to be removed.
	 */
	if (!(flg & TCP_FLAG_ACK))
		return NULL;

	/* For Fast Open no more processing is needed (sk is the
	 * child socket).
	 */
	if (fastopen)
		return sk;

	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
	if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
		inet_rsk(req)->acked = 1;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
		return NULL;
	}

	/* OK, ACK is valid, create big socket and
	 * feed this segment to it. It will repeat all
	 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
	 * ESTABLISHED STATE. If it will be dropped after
	 * socket is created, wait for troubles.
	 */
	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL)
		goto listen_overflow;

	inet_csk_reqsk_queue_unlink(sk, req, prev);
	inet_csk_reqsk_queue_removed(sk, req);

	inet_csk_reqsk_queue_add(sk, req, child);
	return child;

listen_overflow:
	if (!sysctl_tcp_abort_on_overflow) {
		inet_rsk(req)->acked = 1;
		return NULL;
	}

embryonic_reset:
	if (!(flg & TCP_FLAG_RST)) {
		/* Received a bad SYN pkt - for TFO we try not to reset
		 * the local connection unless it's really necessary to
		 * avoid becoming vulnerable to outside attack aiming at
		 * resetting legit local connections.
		 */
		req->rsk_ops->send_reset(sk, skb);
	} else if (fastopen) { /* received a valid RST pkt */
		reqsk_fastopen_remove(sk, req, true);
		tcp_reset(sk);
	}
	if (!fastopen) {
		inet_csk_reqsk_queue_drop(sk, req, prev);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_check_req);

/*
 * Queue the segment on the new socket if the new socket is active,
 * otherwise we just shortcircuit this and continue with
 * the new socket.
 *
 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
 * when entering. But other states are possible due to a race condition
 * where, after __inet_lookup_established() fails but before the listener
 * lock is obtained, other packets cause the same connection to
 * be created.
 */

int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb)
{
	int ret = 0;
	int state = child->sk_state;

	if (!sock_owned_by_user(child)) {
		ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
					    skb->len);
		/* Wakeup parent, send SIGIO */
		if (state == TCP_SYN_RECV && child->sk_state != state)
			parent->sk_data_ready(parent);
	} else {
		/* Alas, it is possible again, because we do lookup
		 * in main socket hash table and lock on listening
		 * socket does not protect us more.
		 */
		__sk_add_backlog(child, skb);
	}

	bh_unlock_sock(child);
	sock_put(child);
	return ret;
}
EXPORT_SYMBOL(tcp_child_process);