inet: Remove unused send_check length argument
[linux-2.6-block.git] / net / ipv4 / tcp_output.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *		Cacophonix Gaul :	draft-minshall-nagle-01
 *		J Hadi Salim	:	ECN support
 *
 */

#include <net/tcp.h>

#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse __read_mostly = 1;

/* People can turn this on to work with those rare, broken TCPs that
 * interpret the window field as a signed quantity.
 */
int sysctl_tcp_workaround_signed_windows __read_mostly = 0;

/* This limits the percentage of the congestion window which we
 * will allow a single TSO frame to consume.  Building TSO frames
 * which are too large can cause TCP streams to be bursty.
 */
int sysctl_tcp_tso_win_divisor __read_mostly = 3;

int sysctl_tcp_mtu_probing __read_mostly = 0;
int sysctl_tcp_base_mss __read_mostly = 512;

/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;

int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);


/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int prior_packets = tp->packets_out;

	tcp_advance_send_head(sk, skb);
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;

	/* Don't override Nagle indefinitely with F-RTO */
	if (tp->frto_counter == 2)
		tp->frto_counter = 3;

	tp->packets_out += tcp_skb_pcount(skb);
	if (!prior_packets)
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}

/* SND.NXT, if window was not shrunk.
 * If window has been shrunk, what should we make? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static inline __u32 tcp_acceptable_seq(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!before(tcp_wnd_end(tp), tp->snd_nxt))
		return tp->snd_nxt;
	else
		return tcp_wnd_end(tp);
}

/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MSS.
 * 4. We do not make 3, we advertise MSS, calculated from first
 *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
 *    This may be overridden via information stored in routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	int mss = tp->advmss;

	if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
		mss = dst_metric(dst, RTAX_ADVMSS);
		tp->advmss = mss;
	}

	return (__u16)mss;
}

/* RFC2861. Reset CWND after an idle period longer than RTO to the
 * "restart window". This is the first part of the cwnd validation mechanism.
 */
static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
{
	struct tcp_sock *tp = tcp_sk(sk);
	s32 delta = tcp_time_stamp - tp->lsndtime;
	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
	u32 cwnd = tp->snd_cwnd;

	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);

	tp->snd_ssthresh = tcp_current_ssthresh(sk);
	restart_cwnd = min(restart_cwnd, cwnd);

	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
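	/* Illustrative example (added note, numbers not from the source):
	 * after a bit more than three idle RTOs the loop above has halved
	 * cwnd three times, so a cwnd of 40 segments decays to 5; the max()
	 * below then keeps it from dropping under the restart window.
	 */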
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_time_stamp;
	tp->snd_cwnd_used = 0;
}

/* Congestion state accounting after a packet has been sent. */
static void tcp_event_data_sent(struct tcp_sock *tp,
				struct sk_buff *skb, struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const u32 now = tcp_time_stamp;

	if (sysctl_tcp_slow_start_after_idle &&
	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
		tcp_cwnd_restart(sk, __sk_dst_get(sk));

	tp->lsndtime = now;

	/* If it is a reply for ato after last received
	 * packet, enter pingpong mode.
	 */
	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
		icsk->icsk_ack.pingpong = 1;
}

/* Account for an ACK we sent. */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
	tcp_dec_quickack_mode(sk, pkts);
	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}

/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered. Store the results in the tp structure.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible. We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 */
void tcp_select_initial_window(int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale,
			       __u32 init_rcv_wnd)
{
	unsigned int space = (__space < 0 ? 0 : __space);

	/* If no clamp set the clamp to the max possible scaled window */
	if (*window_clamp == 0)
		(*window_clamp) = (65535 << 14);
	space = min(*window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = (space / mss) * mss;

	/* NOTE: offering an initial window larger than 32767
	 * will break some buggy TCP stacks. If the admin tells us
	 * it is likely we could be speaking with such a buggy stack
	 * we will truncate our initial window offering to 32K-1
	 * unless the remote has sent us a window scaling option,
	 * which we interpret as a sign the remote TCP is not
	 * misinterpreting the window field as a signed quantity.
	 */
	if (sysctl_tcp_workaround_signed_windows)
		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	else
		(*rcv_wnd) = space;

	(*rcv_wscale) = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window
		 * See RFC1323 for an explanation of the limit to 14
		 */
		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		while (space > 65535 && (*rcv_wscale) < 14) {
			space >>= 1;
			(*rcv_wscale)++;
		}
	}

	/* Set the initial window to a value large enough for senders
	 * following RFC2414. Senders not following this RFC will be
	 * satisfied with 2.
	 */
	if (mss > (1 << *rcv_wscale)) {
		int init_cwnd = 4;
		if (mss > 1460 * 3)
			init_cwnd = 2;
		else if (mss > 1460)
			init_cwnd = 3;
		/* when initializing use the value from init_rcv_wnd
		 * rather than the default from above
		 */
		if (init_rcv_wnd &&
		    (*rcv_wnd > init_rcv_wnd * mss))
			*rcv_wnd = init_rcv_wnd * mss;
		else if (*rcv_wnd > init_cwnd * mss)
			*rcv_wnd = init_cwnd * mss;
	}
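	/* Worked example (added note, values illustrative): with mss = 1460
	 * and no init_rcv_wnd override, init_cwnd stays 4, so the initial
	 * offer is capped at 4 * 1460 = 5840 bytes, provided the available
	 * space allowed at least that much in the first place.
	 */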

	/* Set the clamp no higher than max representable value */
	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
}

/* Choose a new window to advertise, update state in tcp_sock for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static u16 tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if (new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time.  --DaveM
		 *
		 * Relax Will Robinson.
		 */
		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* Make sure we do not exceed the maximum possible
	 * scaled window.
	 */
	if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
		new_win = min(new_win, MAX_TCP_WINDOW);
	else
		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));

	/* RFC1323 scaling applied */
	new_win >>= tp->rx_opt.rcv_wscale;
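	/* e.g. (added note, illustrative): with rcv_wscale = 7, a
	 * 1048576-byte window is put on the wire as 1048576 >> 7 = 8192.
	 */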

	/* If we advertise zero window, disable fast path. */
	if (new_win == 0)
		tp->pred_flags = 0;

	return new_win;
}

/* Packet ECN state for a SYN-ACK */
static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
	if (!(tp->ecn_flags & TCP_ECN_OK))
		TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
}

/* Packet ECN state for a SYN. */
static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->ecn_flags = 0;
	if (sysctl_tcp_ecn == 1) {
		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR;
		tp->ecn_flags = TCP_ECN_OK;
	}
}

static __inline__ void
TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th)
{
	if (inet_rsk(req)->ecn_ok)
		th->ece = 1;
}

/* Set up ECN state for a packet on an ESTABLISHED socket that is about to
 * be sent.
 */
static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
				int tcp_header_len)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->ecn_flags & TCP_ECN_OK) {
		/* Not-retransmitted data segment: set ECT and inject CWR. */
		if (skb->len != tcp_header_len &&
		    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
			INET_ECN_xmit(sk);
			if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
				tcp_hdr(skb)->cwr = 1;
				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
			}
		} else {
			/* ACK or retransmitted segment: clear ECT|CE */
			INET_ECN_dontxmit(sk);
		}
		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
			tcp_hdr(skb)->ece = 1;
	}
}

/* Constructs common control bits of non-data skb. If SYN/FIN is present,
 * auto increment end seqno.
 */
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
	skb->csum = 0;

	TCP_SKB_CB(skb)->flags = flags;
	TCP_SKB_CB(skb)->sacked = 0;

	skb_shinfo(skb)->gso_segs = 1;
	skb_shinfo(skb)->gso_size = 0;
	skb_shinfo(skb)->gso_type = 0;

	TCP_SKB_CB(skb)->seq = seq;
	if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN))
		seq++;
	TCP_SKB_CB(skb)->end_seq = seq;
}

static inline int tcp_urg_mode(const struct tcp_sock *tp)
{
	return tp->snd_una != tp->snd_up;
}

#define OPTION_SACK_ADVERTISE	(1 << 0)
#define OPTION_TS		(1 << 1)
#define OPTION_MD5		(1 << 2)
#define OPTION_WSCALE		(1 << 3)
#define OPTION_COOKIE_EXTENSION	(1 << 4)

struct tcp_out_options {
	u8 options;		/* bit field of OPTION_* */
	u8 ws;			/* window scale, 0 to disable */
	u8 num_sack_blocks;	/* number of SACK blocks to include */
	u8 hash_size;		/* bytes in hash_location */
	u16 mss;		/* 0 to disable */
	__u32 tsval, tsecr;	/* need to include OPTION_TS */
	__u8 *hash_location;	/* temporary pointer, overloaded */
};

/* The sysctl int routines are generic, so check consistency here.
 */
static u8 tcp_cookie_size_check(u8 desired)
{
	if (desired > 0) {
		/* previously specified */
		return desired;
	}
	if (sysctl_tcp_cookie_size <= 0) {
		/* no default specified */
		return 0;
	}
	if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) {
		/* value too small, specify minimum */
		return TCP_COOKIE_MIN;
	}
	if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) {
		/* value too large, specify maximum */
		return TCP_COOKIE_MAX;
	}
	if (0x1 & sysctl_tcp_cookie_size) {
		/* 8-bit multiple, illegal, fix it */
		return (u8)(sysctl_tcp_cookie_size + 0x1);
	}
	return (u8)sysctl_tcp_cookie_size;
}

/* Write previously computed TCP options to the packet.
 *
 * Beware: Something in the Internet is very sensitive to the ordering of
 * TCP options, we learned this the hard way, so be careful here.
 * Luckily we can at least blame others for their non-compliance but from
 * an interoperability perspective it seems that we're somewhat stuck with
 * the ordering which we have been using if we want to keep working with
 * those broken things (not that it currently hurts anybody as there isn't
 * a particular reason why the ordering would need to be changed).
 *
 * At least SACK_PERM as the first option is known to lead to a disaster
 * (but it may well be that other scenarios fail similarly).
 */
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
			      struct tcp_out_options *opts)
{
	u8 options = opts->options;	/* mungable copy */

	/* Having both authentication and cookies for security is redundant,
	 * and there's certainly not enough room. Instead, the cookie-less
	 * extension variant is proposed.
	 *
	 * Consider the pessimal case with authentication. The options
	 * could look like:
	 *   COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
	 */
	if (unlikely(OPTION_MD5 & options)) {
		if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
			*ptr++ = htonl((TCPOPT_COOKIE << 24) |
				       (TCPOLEN_COOKIE_BASE << 16) |
				       (TCPOPT_MD5SIG << 8) |
				       TCPOLEN_MD5SIG);
		} else {
			*ptr++ = htonl((TCPOPT_NOP << 24) |
				       (TCPOPT_NOP << 16) |
				       (TCPOPT_MD5SIG << 8) |
				       TCPOLEN_MD5SIG);
		}
		options &= ~OPTION_COOKIE_EXTENSION;
		/* overload cookie hash location */
		opts->hash_location = (__u8 *)ptr;
		ptr += 4;
	}

	if (unlikely(opts->mss)) {
		*ptr++ = htonl((TCPOPT_MSS << 24) |
			       (TCPOLEN_MSS << 16) |
			       opts->mss);
	}

	if (likely(OPTION_TS & options)) {
		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
				       (TCPOLEN_SACK_PERM << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
			options &= ~OPTION_SACK_ADVERTISE;
		} else {
			*ptr++ = htonl((TCPOPT_NOP << 24) |
				       (TCPOPT_NOP << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
		}
		*ptr++ = htonl(opts->tsval);
		*ptr++ = htonl(opts->tsecr);
	}

	/* Specification requires after timestamp, so do it now.
	 *
	 * Consider the pessimal case without authentication. The options
	 * could look like:
	 *   MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
	 */
	if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
		__u8 *cookie_copy = opts->hash_location;
		u8 cookie_size = opts->hash_size;

		/* 8-bit multiple handled in tcp_cookie_size_check() above,
		 * and elsewhere.
		 */
		if (0x2 & cookie_size) {
			__u8 *p = (__u8 *)ptr;

			/* 16-bit multiple */
			*p++ = TCPOPT_COOKIE;
			*p++ = TCPOLEN_COOKIE_BASE + cookie_size;
			*p++ = *cookie_copy++;
			*p++ = *cookie_copy++;
			ptr++;
			cookie_size -= 2;
		} else {
			/* 32-bit multiple */
			*ptr++ = htonl(((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_COOKIE << 8) |
					TCPOLEN_COOKIE_BASE) +
				       cookie_size);
		}

		if (cookie_size > 0) {
			memcpy(ptr, cookie_copy, cookie_size);
			ptr += (cookie_size / 4);
		}
	}

	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_SACK_PERM << 8) |
			       TCPOLEN_SACK_PERM);
	}

	if (unlikely(OPTION_WSCALE & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_WINDOW << 16) |
			       (TCPOLEN_WINDOW << 8) |
			       opts->ws);
	}

	if (unlikely(opts->num_sack_blocks)) {
		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
			tp->duplicate_sack : tp->selective_acks;
		int this_sack;

		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_SACK << 8) |
			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
						     TCPOLEN_SACK_PERBLOCK)));

		for (this_sack = 0; this_sack < opts->num_sack_blocks;
		     ++this_sack) {
			*ptr++ = htonl(sp[this_sack].start_seq);
			*ptr++ = htonl(sp[this_sack].end_seq);
		}

		tp->rx_opt.dsack = 0;
	}
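	/* Illustrative sizing (added note): three SACK blocks written above
	 * occupy 2 NOPs + 2 bytes of kind/length + 3 * TCPOLEN_SACK_PERBLOCK
	 * (8) = 28 bytes of the 40-byte option space.
	 */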
}

/* Compute TCP options for SYN packets. This is not the final
 * network wire format yet.
 */
static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
				struct tcp_out_options *opts,
				struct tcp_md5sig_key **md5) {
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_cookie_values *cvp = tp->cookie_values;
	unsigned remaining = MAX_TCP_OPTION_SPACE;
	u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
			 tcp_cookie_size_check(cvp->cookie_desired) :
			 0;

#ifdef CONFIG_TCP_MD5SIG
	*md5 = tp->af_specific->md5_lookup(sk, sk);
	if (*md5) {
		opts->options |= OPTION_MD5;
		remaining -= TCPOLEN_MD5SIG_ALIGNED;
	}
#else
	*md5 = NULL;
#endif

	/* We always get an MSS option.  The option bytes which will be seen in
	 * normal data packets should timestamps be used, must be in the MSS
	 * advertised.  But we subtract them from tp->mss_cache so that
	 * calculations in tcp_sendmsg are simpler etc.  So account for this
	 * fact here if necessary.  If we don't do this correctly, as a
	 * receiver we won't recognize data packets as being full sized when we
	 * should, and thus we won't abide by the delayed ACK rules correctly.
	 * SACKs don't matter, we never delay an ACK when we have any of those
	 * going out. */
	opts->mss = tcp_advertise_mss(sk);
	remaining -= TCPOLEN_MSS_ALIGNED;

	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
		opts->options |= OPTION_TS;
		opts->tsval = TCP_SKB_CB(skb)->when;
		opts->tsecr = tp->rx_opt.ts_recent;
		remaining -= TCPOLEN_TSTAMP_ALIGNED;
	}
	if (likely(sysctl_tcp_window_scaling)) {
		opts->ws = tp->rx_opt.rcv_wscale;
		opts->options |= OPTION_WSCALE;
		remaining -= TCPOLEN_WSCALE_ALIGNED;
	}
	if (likely(sysctl_tcp_sack)) {
		opts->options |= OPTION_SACK_ADVERTISE;
		if (unlikely(!(OPTION_TS & opts->options)))
			remaining -= TCPOLEN_SACKPERM_ALIGNED;
	}
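	/* Illustrative budget (added note): with timestamps, window scaling
	 * and SACK all enabled and no MD5 key, the accounting above uses
	 * 4 (MSS) + 12 (TS, with SACK_PERM folded in) + 4 (WSCALE) = 20 of
	 * the 40 option bytes, leaving 20 for a cookie extension.
	 */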

	/* Note that timestamps are required by the specification.
	 *
	 * Odd numbers of bytes are prohibited by the specification, ensuring
	 * that the cookie is 16-bit aligned, and the resulting cookie pair is
	 * 32-bit aligned.
	 */
	if (*md5 == NULL &&
	    (OPTION_TS & opts->options) &&
	    cookie_size > 0) {
		int need = TCPOLEN_COOKIE_BASE + cookie_size;

		if (0x2 & need) {
			/* 32-bit multiple */
			need += 2; /* NOPs */

			if (need > remaining) {
				/* try shrinking cookie to fit */
				cookie_size -= 2;
				need -= 4;
			}
		}
		while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
			cookie_size -= 4;
			need -= 4;
		}
		if (TCP_COOKIE_MIN <= cookie_size) {
			opts->options |= OPTION_COOKIE_EXTENSION;
			opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
			opts->hash_size = cookie_size;

			/* Remember for future incarnations. */
			cvp->cookie_desired = cookie_size;

			if (cvp->cookie_desired != cvp->cookie_pair_size) {
				/* Currently use random bytes as a nonce,
				 * assuming these are completely unpredictable
				 * by hostile users of the same system.
				 */
				get_random_bytes(&cvp->cookie_pair[0],
						 cookie_size);
				cvp->cookie_pair_size = cookie_size;
			}

			remaining -= need;
		}
	}
	return MAX_TCP_OPTION_SPACE - remaining;
}

/* Set up TCP options for SYN-ACKs. */
static unsigned tcp_synack_options(struct sock *sk,
				   struct request_sock *req,
				   unsigned mss, struct sk_buff *skb,
				   struct tcp_out_options *opts,
				   struct tcp_md5sig_key **md5,
				   struct tcp_extend_values *xvp)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	unsigned remaining = MAX_TCP_OPTION_SPACE;
	u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
			 xvp->cookie_plus :
			 0;
	bool doing_ts = ireq->tstamp_ok;

#ifdef CONFIG_TCP_MD5SIG
	*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
	if (*md5) {
		opts->options |= OPTION_MD5;
		remaining -= TCPOLEN_MD5SIG_ALIGNED;

		/* We can't fit any SACK blocks in a packet with MD5 + TS
		 * options. There was discussion about disabling SACK
		 * rather than TS in order to fit in better with old,
		 * buggy kernels, but that was deemed to be unnecessary.
		 */
		doing_ts &= !ireq->sack_ok;
	}
#else
	*md5 = NULL;
#endif

	/* We always send an MSS option. */
	opts->mss = mss;
	remaining -= TCPOLEN_MSS_ALIGNED;

	if (likely(ireq->wscale_ok)) {
		opts->ws = ireq->rcv_wscale;
		opts->options |= OPTION_WSCALE;
		remaining -= TCPOLEN_WSCALE_ALIGNED;
	}
	if (likely(doing_ts)) {
		opts->options |= OPTION_TS;
		opts->tsval = TCP_SKB_CB(skb)->when;
		opts->tsecr = req->ts_recent;
		remaining -= TCPOLEN_TSTAMP_ALIGNED;
	}
	if (likely(ireq->sack_ok)) {
		opts->options |= OPTION_SACK_ADVERTISE;
		if (unlikely(!doing_ts))
			remaining -= TCPOLEN_SACKPERM_ALIGNED;
	}

	/* Similar rationale to tcp_syn_options() applies here, too.
	 * If the <SYN> options fit, the same options should fit now!
	 */
	if (*md5 == NULL &&
	    doing_ts &&
	    cookie_plus > TCPOLEN_COOKIE_BASE) {
		int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */

		if (0x2 & need) {
			/* 32-bit multiple */
			need += 2; /* NOPs */
		}
		if (need <= remaining) {
			opts->options |= OPTION_COOKIE_EXTENSION;
			opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
			remaining -= need;
		} else {
			/* There's no error return, so flag it. */
			xvp->cookie_out_never = 1; /* true */
			opts->hash_size = 0;
		}
	}
	return MAX_TCP_OPTION_SPACE - remaining;
}

/* Compute TCP options for ESTABLISHED sockets. This is not the
 * final wire format yet.
 */
static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
					struct tcp_out_options *opts,
					struct tcp_md5sig_key **md5) {
	struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned size = 0;
	unsigned int eff_sacks;

#ifdef CONFIG_TCP_MD5SIG
	*md5 = tp->af_specific->md5_lookup(sk, sk);
	if (unlikely(*md5)) {
		opts->options |= OPTION_MD5;
		size += TCPOLEN_MD5SIG_ALIGNED;
	}
#else
	*md5 = NULL;
#endif

	if (likely(tp->rx_opt.tstamp_ok)) {
		opts->options |= OPTION_TS;
		opts->tsval = tcb ? tcb->when : 0;
		opts->tsecr = tp->rx_opt.ts_recent;
		size += TCPOLEN_TSTAMP_ALIGNED;
	}

	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
	if (unlikely(eff_sacks)) {
		const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
		opts->num_sack_blocks =
			min_t(unsigned, eff_sacks,
			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
			      TCPOLEN_SACK_PERBLOCK);
		size += TCPOLEN_SACK_BASE_ALIGNED +
			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
	}

	return size;
}

/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	struct tcp_out_options opts;
	unsigned tcp_options_size, tcp_header_size;
	struct tcp_md5sig_key *md5;
	struct tcphdr *th;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));

	/* If congestion control is doing timestamping, we must
	 * take such a timestamp before we potentially clone/copy.
	 */
	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
		__net_timestamp(skb);

	if (likely(clone_it)) {
		if (unlikely(skb_cloned(skb)))
			skb = pskb_copy(skb, gfp_mask);
		else
			skb = skb_clone(skb, gfp_mask);
		if (unlikely(!skb))
			return -ENOBUFS;
	}

	inet = inet_sk(sk);
	tp = tcp_sk(sk);
	tcb = TCP_SKB_CB(skb);
	memset(&opts, 0, sizeof(opts));

	if (unlikely(tcb->flags & TCPCB_FLAG_SYN))
		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
	else
		tcp_options_size = tcp_established_options(sk, skb, &opts,
							   &md5);
	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);
	skb_set_owner_w(skb, sk);

	/* Build TCP header and checksum it. */
	th = tcp_hdr(skb);
	th->source		= inet->inet_sport;
	th->dest		= inet->inet_dport;
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(tp->rcv_nxt);
	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
					tcb->flags);
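	/* Added note: the write above packs the 4-bit data offset (header
	 * length in 32-bit words) and the flag bits into the 16-bit word at
	 * byte offset 12 of the TCP header; a 32-byte header gives doff = 8.
	 */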

	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window	= htons(min(tp->rcv_wnd, 65535U));
	} else {
		th->window	= htons(tcp_select_window(sk));
	}
	th->check		= 0;
	th->urg_ptr		= 0;

	/* The urg_mode check is necessary during a below snd_una win probe */
	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
		if (before(tp->snd_up, tcb->seq + 0x10000)) {
			th->urg_ptr = htons(tp->snd_up - tcb->seq);
			th->urg = 1;
		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
			th->urg_ptr = 0xFFFF;
			th->urg = 1;
		}
	}
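	/* Added note: the urgent pointer is a 16-bit offset from tcb->seq,
	 * so it can reference at most 65535 bytes ahead; that is why snd_up
	 * is range-checked above before being written into the header.
	 */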

	tcp_options_write((__be32 *)(th + 1), tp, &opts);
	if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
		TCP_ECN_send(sk, skb, tcp_header_size);

#ifdef CONFIG_TCP_MD5SIG
	/* Calculate the MD5 hash, as we have all we need now */
	if (md5) {
		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		tp->af_specific->calc_md5_hash(opts.hash_location,
					       md5, sk, NULL, skb);
	}
#endif

	icsk->icsk_af_ops->send_check(sk, skb);

	if (likely(tcb->flags & TCPCB_FLAG_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));

	if (skb->len != tcp_header_size)
		tcp_event_data_sent(tp, skb, sk);

	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
		TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);

	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
	if (likely(err <= 0))
		return err;

	tcp_enter_cwr(sk, 1);

	return net_xmit_eval(err);
}

/* This routine just queues the buffer for sending.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
	skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
}

/* Initialize TSO segments for a packet. */
static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb,
				 unsigned int mss_now)
{
	if (skb->len <= mss_now || !sk_can_gso(sk) ||
	    skb->ip_summed == CHECKSUM_NONE) {
		/* Avoid the costly divide in the normal
		 * non-TSO case.
		 */
		skb_shinfo(skb)->gso_segs = 1;
		skb_shinfo(skb)->gso_size = 0;
		skb_shinfo(skb)->gso_type = 0;
	} else {
		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
		skb_shinfo(skb)->gso_size = mss_now;
		skb_shinfo(skb)->gso_type = sk->sk_gso_type;
	}
}

/* When a modification to fackets out becomes necessary, we need to check
 * skb is counted to fackets_out or not.
 */
static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,
				   int decr)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->sacked_out || tcp_is_reno(tp))
		return;

	if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
		tp->fackets_out -= decr;
}

/* Pcount in the middle of the write queue got changed, we need to do various
 * tweaks to fix counters
 */
static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->packets_out -= decr;

	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
		tp->sacked_out -= decr;
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
		tp->retrans_out -= decr;
	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
		tp->lost_out -= decr;

	/* Reno case is special. Sigh... */
	if (tcp_is_reno(tp) && decr > 0)
		tp->sacked_out -= min_t(u32, tp->sacked_out, decr);

	tcp_adjust_fackets_out(sk, skb, decr);

	if (tp->lost_skb_hint &&
	    before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
	    (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
		tp->lost_cnt_hint -= decr;

	tcp_verify_left_out(tp);
}

/* Function to create two new TCP segments. Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
		 unsigned int mss_now)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int nsize, old_factor;
	int nlen;
	u8 flags;

	BUG_ON(len > skb->len);

	nsize = skb_headlen(skb) - len;
	if (nsize < 0)
		nsize = 0;

	if (skb_cloned(skb) &&
	    skb_is_nonlinear(skb) &&
	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	/* Get a new skb... force flag on. */
	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
	if (buff == NULL)
		return -ENOMEM; /* We'll just try again later. */

	sk->sk_wmem_queued += buff->truesize;
	sk_mem_charge(sk, buff->truesize);
	nlen = skb->len - len - nsize;
	buff->truesize += nlen;
	skb->truesize -= nlen;

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
	TCP_SKB_CB(buff)->flags = flags;
	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;

	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
		/* Copy and checksum data tail into the new buffer. */
		buff->csum = csum_partial_copy_nocheck(skb->data + len,
						       skb_put(buff, nsize),
						       nsize, 0);

		skb_trim(skb, len);

		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
	} else {
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb_split(skb, buff, len);
	}

	buff->ip_summed = skb->ip_summed;

	/* Looks stupid, but our code really uses the 'when' of skbs
	 * which it never sent before. --ANK
	 */
	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
	buff->tstamp = skb->tstamp;

	old_factor = tcp_skb_pcount(skb);

	/* Fix up tso_factor for both original and new SKB. */
	tcp_set_skb_tso_segs(sk, skb, mss_now);
	tcp_set_skb_tso_segs(sk, buff, mss_now);

	/* If this packet has been sent out already, we must
	 * adjust the various packet counters.
	 */
	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
		int diff = old_factor - tcp_skb_pcount(skb) -
			   tcp_skb_pcount(buff);

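		/* Added note: for an mss-aligned split diff is 0; a split in
		 * the middle of a segment adds one segment overall, so diff
		 * becomes -1 and tcp_adjust_pcount() re-credits it below.
		 */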
		if (diff)
			tcp_adjust_pcount(sk, skb, diff);
	}

	/* Link BUFF into the send queue. */
	skb_header_release(buff);
	tcp_insert_write_queue_after(skb, buff, sk);

	return 0;
}

/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
 * eventually). The difference is that pulled data is not copied, but
 * immediately discarded.
 */
static void __pskb_trim_head(struct sk_buff *skb, int len)
{
	int i, k, eat;

	eat = len;
	k = 0;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		if (skb_shinfo(skb)->frags[i].size <= eat) {
			put_page(skb_shinfo(skb)->frags[i].page);
			eat -= skb_shinfo(skb)->frags[i].size;
		} else {
			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
			if (eat) {
				skb_shinfo(skb)->frags[k].page_offset += eat;
				skb_shinfo(skb)->frags[k].size -= eat;
				eat = 0;
			}
			k++;
		}
	}
	skb_shinfo(skb)->nr_frags = k;

	skb_reset_tail_pointer(skb);
	skb->data_len -= len;
	skb->len = skb->data_len;
}

/* Remove acked data from a packet in the transmit queue. */
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	/* If len == headlen, we avoid __skb_pull to preserve alignment. */
	if (unlikely(len < skb_headlen(skb)))
		__skb_pull(skb, len);
	else
		__pskb_trim_head(skb, len - skb_headlen(skb));

	TCP_SKB_CB(skb)->seq += len;
	skb->ip_summed = CHECKSUM_PARTIAL;

	skb->truesize	     -= len;
	sk->sk_wmem_queued   -= len;
	sk_mem_uncharge(sk, len);
	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);

	/* Any change of skb->len requires recalculation of tso
	 * factor and mss.
	 */
	if (tcp_skb_pcount(skb) > 1)
		tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk));

	return 0;
}

/* Calculate MSS. Not accounting for SACKs here. */
int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	/* Calculate base mss without TCP options:
	   It is MMS_S - sizeof(tcphdr) of rfc1122
	 */
	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->rx_opt.mss_clamp)
		mss_now = tp->rx_opt.mss_clamp;

	/* Now subtract optional transport overhead */
	mss_now -= icsk->icsk_ext_hdr_len;

	/* Then reserve room for full set of TCP options and 8 bytes of data */
	if (mss_now < 48)
		mss_now = 48;

	/* Now subtract TCP options size, not including SACKs */
	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

	return mss_now;
}
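/* Illustrative arithmetic (added note): for an IPv4 path MTU of 1500 with
 * no extension headers and timestamps enabled, the result is
 * 1500 - 20 (IP) - 20 (TCP) - 12 (timestamp option) = 1448 bytes.
 */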

/* Inverse of above */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mtu;

	mtu = mss +
	      tp->tcp_header_len +
	      icsk->icsk_ext_hdr_len +
	      icsk->icsk_af_ops->net_header_len;

	return mtu;
}

/* MTU probing init per socket */
void tcp_mtup_init(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
				      icsk->icsk_af_ops->net_header_len;
	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
	icsk->icsk_mtup.probe_size = 0;
}

/* This function synchronizes snd mss to current pmtu/exthdr set.

   tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT count
   for TCP options, but includes only bare TCP header.

   tp->rx_opt.mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->rx_opt.mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
   are READ ONLY outside this function. --ANK (980731)
 */
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	if (icsk->icsk_mtup.search_high > pmtu)
		icsk->icsk_mtup.search_high = pmtu;

	mss_now = tcp_mtu_to_mss(sk, pmtu);
	mss_now = tcp_bound_to_half_wnd(tp, mss_now);

	/* And store cached results */
	icsk->icsk_pmtu_cookie = pmtu;
	if (icsk->icsk_mtup.enabled)
		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
	tp->mss_cache = mss_now;

	return mss_now;
}

/* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
 */
unsigned int tcp_current_mss(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	u32 mss_now;
	unsigned header_len;
	struct tcp_out_options opts;
	struct tcp_md5sig_key *md5;

	mss_now = tp->mss_cache;

	if (dst) {
		u32 mtu = dst_mtu(dst);
		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
			mss_now = tcp_sync_mss(sk, mtu);
	}

	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
		     sizeof(struct tcphdr);
	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
	 * some common options. If this is an odd packet (because we have SACK
	 * blocks etc) then our calculated header_len will be different, and
	 * we have to adjust mss_now correspondingly */
	if (header_len != tp->tcp_header_len) {
		int delta = (int) header_len - tp->tcp_header_len;
		mss_now -= delta;
	}

	return mss_now;
}

/* Congestion window validation. (RFC2861) */
static void tcp_cwnd_validate(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->packets_out >= tp->snd_cwnd) {
		/* Network is fed fully. */
		tp->snd_cwnd_used = 0;
		tp->snd_cwnd_stamp = tcp_time_stamp;
	} else {
		/* Network starves. */
		if (tp->packets_out > tp->snd_cwnd_used)
			tp->snd_cwnd_used = tp->packets_out;

		if (sysctl_tcp_slow_start_after_idle &&
		    (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
			tcp_cwnd_application_limited(sk);
	}
}

/* Returns the portion of skb which can be sent right away without
 * introducing MSS oddities to segment boundaries. In rare cases where
 * mss_now != mss_cache, we will request caller to create a small skb
 * per input skb which could be mostly avoided here (if desired).
 *
 * We explicitly want to create a request for splitting write queue tail
 * to a small skb for Nagle purposes while avoiding unnecessary modulos,
 * thus all the complexity (cwnd_len is always MSS multiple which we
 * return whenever allowed by the other factors). Basically we need the
 * modulo only when the receiver window alone is the limiting factor or
 * when we would be allowed to send the split-due-to-Nagle skb fully.
 */
static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb,
					unsigned int mss_now, unsigned int cwnd)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 needed, window, cwnd_len;

	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
	cwnd_len = mss_now * cwnd;

	if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
		return cwnd_len;

	needed = min(skb->len, window);

	if (cwnd_len <= needed)
		return cwnd_len;

	return needed - needed % mss_now;
}
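/* Example (added note, illustrative numbers): with mss_now = 1460, a
 * 12000-byte skb, a receive window of 10000 bytes and cwnd room for 10
 * segments, cwnd_len (14600) exceeds the window, needed = 10000, and the
 * function returns 10000 - 10000 % 1460 = 8760, i.e. six full segments.
 */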

/* Can at least one segment of SKB be sent right now, according to the
 * congestion window rules?  If so, return how many segments are allowed.
 */
static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
					 struct sk_buff *skb)
{
	u32 in_flight, cwnd;

	/* Don't be strict about the congestion window for the final FIN. */
	if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
	    tcp_skb_pcount(skb) == 1)
		return 1;

	in_flight = tcp_packets_in_flight(tp);
	cwnd = tp->snd_cwnd;
	if (in_flight < cwnd)
		return (cwnd - in_flight);

	return 0;
}

/* Initialize TSO state of a skb.
 * This must be invoked the first time we consider transmitting
 * SKB onto the wire.
 */
static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
			     unsigned int mss_now)
{
	int tso_segs = tcp_skb_pcount(skb);

	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
		tcp_set_skb_tso_segs(sk, skb, mss_now);
		tso_segs = tcp_skb_pcount(skb);
	}
	return tso_segs;
}

/* Minshall's variant of the Nagle send check. */
static inline int tcp_minshall_check(const struct tcp_sock *tp)
{
	return after(tp->snd_sml, tp->snd_una) &&
		!after(tp->snd_sml, tp->snd_nxt);
}

/* Return 0, if packet can be sent now without violating Nagle's rules:
 * 1. It is full sized.
 * 2. Or it contains FIN. (already checked by caller)
 * 3. Or TCP_NODELAY was set.
 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
 *    With Minshall's modification: all sent small packets are ACKed.
 */
static inline int tcp_nagle_check(const struct tcp_sock *tp,
				  const struct sk_buff *skb,
				  unsigned mss_now, int nonagle)
{
	return (skb->len < mss_now &&
		((nonagle & TCP_NAGLE_CORK) ||
		 (!nonagle && tp->packets_out && tcp_minshall_check(tp))));
}

/* Return non-zero if the Nagle test allows this packet to be
 * sent now.
 */
static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
				 unsigned int cur_mss, int nonagle)
{
	/* Nagle rule does not apply to frames, which sit in the middle of the
	 * write_queue (they have no chances to get new data).
	 *
	 * This is implemented in the callers, where they modify the 'nonagle'
	 * argument based upon the location of SKB in the send queue.
	 */
	if (nonagle & TCP_NAGLE_PUSH)
		return 1;

	/* Don't use the nagle rule for urgent data (or for the final FIN).
	 * Nagle can be ignored during F-RTO too (see RFC4138).
	 */
	if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
		return 1;

	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
		return 1;

	return 0;
}

/* Does at least the first segment of SKB fit into the send window? */
static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb,
				   unsigned int cur_mss)
{
	u32 end_seq = TCP_SKB_CB(skb)->end_seq;

	if (skb->len > cur_mss)
		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;

	return !after(end_seq, tcp_wnd_end(tp));
}

/* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
 * should be put on the wire right now.  If so, it returns the number of
 * packets allowed by the congestion window.
 */
static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
				 unsigned int cur_mss, int nonagle)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int cwnd_quota;

	tcp_init_tso_segs(sk, skb, cur_mss);

	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
		return 0;

	cwnd_quota = tcp_cwnd_test(tp, skb);
	if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
		cwnd_quota = 0;

	return cwnd_quota;
}

/* Test if sending is allowed right now. */
int tcp_may_send_now(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb = tcp_send_head(sk);

	return (skb &&
		tcp_snd_test(sk, skb, tcp_current_mss(sk),
			     (tcp_skb_is_last(sk, skb) ?