Commit | Line | Data |
---|---|---|
457c8996 | 1 | // SPDX-License-Identifier: GPL-2.0-only |
317a76f9 | 2 | /* |
b92022f3 | 3 | * Pluggable TCP congestion control support and newReno |
317a76f9 | 4 | * congestion control. |
02582e9b | 5 | * Based on ideas from I/O scheduler support and Web100. |
317a76f9 SH |
6 | * |
7 | * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> | |
8 | */ | |
9 | ||
afd46503 JP |
10 | #define pr_fmt(fmt) "TCP: " fmt |
11 | ||
317a76f9 SH |
12 | #include <linux/module.h> |
13 | #include <linux/mm.h> | |
14 | #include <linux/types.h> | |
15 | #include <linux/list.h> | |
5a0e3ad6 | 16 | #include <linux/gfp.h> |
c5c6a8ab | 17 | #include <linux/jhash.h> |
317a76f9 SH |
18 | #include <net/tcp.h> |
19 | ||
/* tcp_cong_list holds every registered congestion control algorithm.
 * Additions and removals in this file are done with tcp_cong_list_lock held
 * (list_add_tail_rcu()/list_del_rcu()); lookups walk the list with
 * list_for_each_entry_rcu().
 */
20 | static DEFINE_SPINLOCK(tcp_cong_list_lock); | |
21 | static LIST_HEAD(tcp_cong_list); | |
22 | ||
23 | /* Simple linear search, don't expect many entries! */ | |
24 | static struct tcp_congestion_ops *tcp_ca_find(const char *name) | |
25 | { | |
26 | struct tcp_congestion_ops *e; | |
27 | ||
5f8ef48d | 28 | list_for_each_entry_rcu(e, &tcp_cong_list, list) { |
317a76f9 SH |
29 | if (strcmp(e->name, name) == 0) |
30 | return e; | |
31 | } | |
32 | ||
33 | return NULL; | |
34 | } | |
35 | ||
/* Look up an algorithm by name, requesting its module on a miss.
 *
 * Must be called with the RCU read lock held.  When the name is not yet
 * registered, CONFIG_MODULES is enabled and the caller has CAP_NET_ADMIN,
 * the RCU read lock is dropped around request_module("tcp_<name>") and
 * re-taken before the lookup is repeated.
 *
 * Returns the matching entry, or NULL if it is (still) not registered.
 */
static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
						       const char *name)
{
	struct tcp_congestion_ops *found;

	found = tcp_ca_find(name);
#ifdef CONFIG_MODULES
	if (found || !capable(CAP_NET_ADMIN))
		return found;

	/* request_module() is called with the RCU read lock released. */
	rcu_read_unlock();
	request_module("tcp_%s", name);
	rcu_read_lock();

	found = tcp_ca_find(name);
#endif
	return found;
}
52 | ||
53 | /* Simple linear search, not much in here. */ | |
54 | struct tcp_congestion_ops *tcp_ca_find_key(u32 key) | |
55 | { | |
56 | struct tcp_congestion_ops *e; | |
57 | ||
58 | list_for_each_entry_rcu(e, &tcp_cong_list, list) { | |
59 | if (e->key == key) | |
60 | return e; | |
61 | } | |
62 | ||
63 | return NULL; | |
64 | } | |
65 | ||
317a76f9 | 66 | /* |
d08df601 | 67 | * Attach new congestion control algorithm to the list |
317a76f9 SH |
68 | * of available options. |
69 | */ | |
70 | int tcp_register_congestion_control(struct tcp_congestion_ops *ca) | |
71 | { | |
72 | int ret = 0; | |
73 | ||
e9799183 FW |
74 | /* all algorithms must implement these */ |
75 | if (!ca->ssthresh || !ca->undo_cwnd || | |
76 | !(ca->cong_avoid || ca->cong_control)) { | |
afd46503 | 77 | pr_err("%s does not implement required ops\n", ca->name); |
317a76f9 SH |
78 | return -EINVAL; |
79 | } | |
80 | ||
c5c6a8ab DB |
81 | ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); |
82 | ||
317a76f9 | 83 | spin_lock(&tcp_cong_list_lock); |
c5c6a8ab DB |
84 | if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) { |
85 | pr_notice("%s already registered or non-unique key\n", | |
86 | ca->name); | |
317a76f9 SH |
87 | ret = -EEXIST; |
88 | } else { | |
3d2573f7 | 89 | list_add_tail_rcu(&ca->list, &tcp_cong_list); |
db2855ae | 90 | pr_debug("%s registered\n", ca->name); |
317a76f9 SH |
91 | } |
92 | spin_unlock(&tcp_cong_list_lock); | |
93 | ||
94 | return ret; | |
95 | } | |
96 | EXPORT_SYMBOL_GPL(tcp_register_congestion_control); | |
97 | ||
98 | /* | |
99 | * Remove congestion control algorithm, called from | |
100 | * the module's remove function. Module ref counts are used | |
101 | * to ensure that this can't be done till all sockets using | |
102 | * that method are closed. | |
103 | */ | |
104 | void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) | |
105 | { | |
106 | spin_lock(&tcp_cong_list_lock); | |
107 | list_del_rcu(&ca->list); | |
108 | spin_unlock(&tcp_cong_list_lock); | |
c5c6a8ab DB |
109 | |
110 | /* Wait for outstanding readers to complete before the | |
111 | * module gets removed entirely. | |
112 | * | |
113 | * A try_module_get() should fail by now as our module is | |
114 | * in "going" state since no refs are held anymore and | |
115 | * module_exit() handler being called. | |
116 | */ | |
117 | synchronize_rcu(); | |
317a76f9 SH |
118 | } |
119 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); | |
120 | ||
6670e152 | 121 | u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) |
c5c6a8ab DB |
122 | { |
123 | const struct tcp_congestion_ops *ca; | |
c3a8d947 | 124 | u32 key = TCP_CA_UNSPEC; |
c5c6a8ab DB |
125 | |
126 | might_sleep(); | |
127 | ||
128 | rcu_read_lock(); | |
6670e152 | 129 | ca = tcp_ca_find_autoload(net, name); |
c3a8d947 DB |
130 | if (ca) { |
131 | key = ca->key; | |
132 | *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; | |
133 | } | |
c5c6a8ab DB |
134 | rcu_read_unlock(); |
135 | ||
136 | return key; | |
137 | } | |
138 | EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name); | |
139 | ||
140 | char *tcp_ca_get_name_by_key(u32 key, char *buffer) | |
141 | { | |
142 | const struct tcp_congestion_ops *ca; | |
143 | char *ret = NULL; | |
144 | ||
145 | rcu_read_lock(); | |
146 | ca = tcp_ca_find_key(key); | |
147 | if (ca) | |
148 | ret = strncpy(buffer, ca->name, | |
149 | TCP_CA_NAME_MAX); | |
150 | rcu_read_unlock(); | |
151 | ||
152 | return ret; | |
153 | } | |
154 | EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key); | |
155 | ||
317a76f9 | 156 | /* Assign choice of congestion control. */ |
55d8694f | 157 | void tcp_assign_congestion_control(struct sock *sk) |
317a76f9 | 158 | { |
6670e152 | 159 | struct net *net = sock_net(sk); |
6687e988 | 160 | struct inet_connection_sock *icsk = inet_csk(sk); |
6670e152 | 161 | const struct tcp_congestion_ops *ca; |
317a76f9 | 162 | |
55d8694f | 163 | rcu_read_lock(); |
6670e152 SH |
164 | ca = rcu_dereference(net->ipv4.tcp_congestion_control); |
165 | if (unlikely(!try_module_get(ca->owner))) | |
166 | ca = &tcp_reno; | |
167 | icsk->icsk_ca_ops = ca; | |
55d8694f FW |
168 | rcu_read_unlock(); |
169 | ||
6670e152 | 170 | memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); |
6ac705b1 ED |
171 | if (ca->flags & TCP_CONG_NEEDS_ECN) |
172 | INET_ECN_xmit(sk); | |
173 | else | |
174 | INET_ECN_dontxmit(sk); | |
55d8694f FW |
175 | } |
176 | ||
177 | void tcp_init_congestion_control(struct sock *sk) | |
178 | { | |
179 | const struct inet_connection_sock *icsk = inet_csk(sk); | |
317a76f9 | 180 | |
44abafc4 | 181 | tcp_sk(sk)->prior_ssthresh = 0; |
6687e988 ACM |
182 | if (icsk->icsk_ca_ops->init) |
183 | icsk->icsk_ca_ops->init(sk); | |
6ac705b1 ED |
184 | if (tcp_ca_needs_ecn(sk)) |
185 | INET_ECN_xmit(sk); | |
186 | else | |
187 | INET_ECN_dontxmit(sk); | |
317a76f9 SH |
188 | } |
189 | ||
ebfa00c5 SD |
190 | static void tcp_reinit_congestion_control(struct sock *sk, |
191 | const struct tcp_congestion_ops *ca) | |
29ba4fff DB |
192 | { |
193 | struct inet_connection_sock *icsk = inet_csk(sk); | |
194 | ||
195 | tcp_cleanup_congestion_control(sk); | |
196 | icsk->icsk_ca_ops = ca; | |
9f950415 | 197 | icsk->icsk_ca_setsockopt = 1; |
c1201444 | 198 | memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); |
29ba4fff | 199 | |
c1201444 | 200 | if (sk->sk_state != TCP_CLOSE) |
6ac705b1 | 201 | tcp_init_congestion_control(sk); |
29ba4fff DB |
202 | } |
203 | ||
317a76f9 | 204 | /* Manage refcounts on socket close. */ |
6687e988 | 205 | void tcp_cleanup_congestion_control(struct sock *sk) |
317a76f9 | 206 | { |
6687e988 ACM |
207 | struct inet_connection_sock *icsk = inet_csk(sk); |
208 | ||
209 | if (icsk->icsk_ca_ops->release) | |
210 | icsk->icsk_ca_ops->release(sk); | |
211 | module_put(icsk->icsk_ca_ops->owner); | |
317a76f9 SH |
212 | } |
213 | ||
214 | /* Used by sysctl to change default congestion control */ | |
6670e152 | 215 | int tcp_set_default_congestion_control(struct net *net, const char *name) |
317a76f9 SH |
216 | { |
217 | struct tcp_congestion_ops *ca; | |
6670e152 SH |
218 | const struct tcp_congestion_ops *prev; |
219 | int ret; | |
317a76f9 | 220 | |
6670e152 SH |
221 | rcu_read_lock(); |
222 | ca = tcp_ca_find_autoload(net, name); | |
223 | if (!ca) { | |
224 | ret = -ENOENT; | |
225 | } else if (!try_module_get(ca->owner)) { | |
226 | ret = -EBUSY; | |
227 | } else { | |
228 | prev = xchg(&net->ipv4.tcp_congestion_control, ca); | |
229 | if (prev) | |
230 | module_put(prev->owner); | |
317a76f9 | 231 | |
6670e152 | 232 | ca->flags |= TCP_CONG_NON_RESTRICTED; |
317a76f9 SH |
233 | ret = 0; |
234 | } | |
6670e152 | 235 | rcu_read_unlock(); |
317a76f9 SH |
236 | |
237 | return ret; | |
238 | } | |
239 | ||
b1736a71 SH |
240 | /* Set default value from kernel configuration at bootup */ |
241 | static int __init tcp_congestion_default(void) | |
242 | { | |
6670e152 SH |
243 | return tcp_set_default_congestion_control(&init_net, |
244 | CONFIG_DEFAULT_TCP_CONG); | |
b1736a71 SH |
245 | } |
246 | late_initcall(tcp_congestion_default); | |
247 | ||
3ff825b2 SH |
248 | /* Build string with list of available congestion control values */ |
249 | void tcp_get_available_congestion_control(char *buf, size_t maxlen) | |
250 | { | |
251 | struct tcp_congestion_ops *ca; | |
252 | size_t offs = 0; | |
253 | ||
254 | rcu_read_lock(); | |
255 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { | |
256 | offs += snprintf(buf + offs, maxlen - offs, | |
257 | "%s%s", | |
258 | offs == 0 ? "" : " ", ca->name); | |
3ff825b2 SH |
259 | } |
260 | rcu_read_unlock(); | |
261 | } | |
262 | ||
317a76f9 | 263 | /* Get current default congestion control */ |
6670e152 | 264 | void tcp_get_default_congestion_control(struct net *net, char *name) |
317a76f9 | 265 | { |
6670e152 | 266 | const struct tcp_congestion_ops *ca; |
317a76f9 SH |
267 | |
268 | rcu_read_lock(); | |
6670e152 | 269 | ca = rcu_dereference(net->ipv4.tcp_congestion_control); |
317a76f9 SH |
270 | strncpy(name, ca->name, TCP_CA_NAME_MAX); |
271 | rcu_read_unlock(); | |
272 | } | |
273 | ||
ce7bc3bf SH |
274 | /* Built list of non-restricted congestion control values */ |
275 | void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) | |
276 | { | |
277 | struct tcp_congestion_ops *ca; | |
278 | size_t offs = 0; | |
279 | ||
280 | *buf = '\0'; | |
281 | rcu_read_lock(); | |
282 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { | |
164891aa | 283 | if (!(ca->flags & TCP_CONG_NON_RESTRICTED)) |
ce7bc3bf SH |
284 | continue; |
285 | offs += snprintf(buf + offs, maxlen - offs, | |
286 | "%s%s", | |
287 | offs == 0 ? "" : " ", ca->name); | |
ce7bc3bf SH |
288 | } |
289 | rcu_read_unlock(); | |
290 | } | |
291 | ||
292 | /* Change list of non-restricted congestion control */ | |
293 | int tcp_set_allowed_congestion_control(char *val) | |
294 | { | |
295 | struct tcp_congestion_ops *ca; | |
c34186ed | 296 | char *saved_clone, *clone, *name; |
ce7bc3bf SH |
297 | int ret = 0; |
298 | ||
c34186ed | 299 | saved_clone = clone = kstrdup(val, GFP_USER); |
ce7bc3bf SH |
300 | if (!clone) |
301 | return -ENOMEM; | |
302 | ||
303 | spin_lock(&tcp_cong_list_lock); | |
304 | /* pass 1 check for bad entries */ | |
305 | while ((name = strsep(&clone, " ")) && *name) { | |
306 | ca = tcp_ca_find(name); | |
307 | if (!ca) { | |
308 | ret = -ENOENT; | |
309 | goto out; | |
310 | } | |
311 | } | |
312 | ||
164891aa | 313 | /* pass 2 clear old values */ |
ce7bc3bf | 314 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) |
164891aa | 315 | ca->flags &= ~TCP_CONG_NON_RESTRICTED; |
ce7bc3bf SH |
316 | |
317 | /* pass 3 mark as allowed */ | |
318 | while ((name = strsep(&val, " ")) && *name) { | |
319 | ca = tcp_ca_find(name); | |
320 | WARN_ON(!ca); | |
321 | if (ca) | |
164891aa | 322 | ca->flags |= TCP_CONG_NON_RESTRICTED; |
ce7bc3bf SH |
323 | } |
324 | out: | |
325 | spin_unlock(&tcp_cong_list_lock); | |
c34186ed | 326 | kfree(saved_clone); |
ce7bc3bf SH |
327 | |
328 | return ret; | |
329 | } | |
330 | ||
91b5b21c LB |
331 | /* Change congestion control for socket. If load is false, then it is the |
332 | * responsibility of the caller to call tcp_init_congestion_control or | |
333 | * tcp_reinit_congestion_control (if the current congestion control was | |
334 | * already initialized. | |
335 | */ | |
8d650cde ED |
336 | int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, |
337 | bool reinit, bool cap_net_admin) | |
5f8ef48d | 338 | { |
6687e988 | 339 | struct inet_connection_sock *icsk = inet_csk(sk); |
c5c6a8ab | 340 | const struct tcp_congestion_ops *ca; |
5f8ef48d SH |
341 | int err = 0; |
342 | ||
c5c6a8ab DB |
343 | if (icsk->icsk_ca_dst_locked) |
344 | return -EPERM; | |
4d4d3d1e | 345 | |
c5c6a8ab | 346 | rcu_read_lock(); |
91b5b21c LB |
347 | if (!load) |
348 | ca = tcp_ca_find(name); | |
349 | else | |
6670e152 SH |
350 | ca = tcp_ca_find_autoload(sock_net(sk), name); |
351 | ||
c5c6a8ab | 352 | /* No change asking for existing value */ |
9f950415 NC |
353 | if (ca == icsk->icsk_ca_ops) { |
354 | icsk->icsk_ca_setsockopt = 1; | |
5f8ef48d | 355 | goto out; |
9f950415 | 356 | } |
6670e152 | 357 | |
91b5b21c | 358 | if (!ca) { |
5f8ef48d | 359 | err = -ENOENT; |
91b5b21c | 360 | } else if (!load) { |
ebfa00c5 SD |
361 | const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; |
362 | ||
363 | if (try_module_get(ca->owner)) { | |
364 | if (reinit) { | |
365 | tcp_reinit_congestion_control(sk, ca); | |
366 | } else { | |
367 | icsk->icsk_ca_ops = ca; | |
368 | module_put(old_ca->owner); | |
369 | } | |
370 | } else { | |
91b5b21c | 371 | err = -EBUSY; |
ebfa00c5 | 372 | } |
8d650cde | 373 | } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { |
ce7bc3bf | 374 | err = -EPERM; |
91b5b21c | 375 | } else if (!try_module_get(ca->owner)) { |
5f8ef48d | 376 | err = -EBUSY; |
91b5b21c | 377 | } else { |
29ba4fff | 378 | tcp_reinit_congestion_control(sk, ca); |
91b5b21c | 379 | } |
5f8ef48d SH |
380 | out: |
381 | rcu_read_unlock(); | |
382 | return err; | |
383 | } | |
384 | ||
9f9843a7 YC |
385 | /* Slow start is used when congestion window is no greater than the slow start |
386 | * threshold. We base on RFC2581 and also handle stretch ACKs properly. | |
387 | * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but | |
388 | * something better;) a packet is only considered (s)acked in its entirety to | |
389 | * defend the ACK attacks described in the RFC. Slow start processes a stretch | |
390 | * ACK of degree N as if N acks of degree 1 are received back to back except | |
391 | * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and | |
392 | * returns the leftover acks to adjust cwnd in congestion avoidance mode. | |
40efc6fa | 393 | */ |
e73ebb08 | 394 | u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) |
40efc6fa | 395 | { |
76174004 | 396 | u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh); |
a02ba041 | 397 | |
e73ebb08 | 398 | acked -= cwnd - tp->snd_cwnd; |
9f9843a7 | 399 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); |
e73ebb08 NC |
400 | |
401 | return acked; | |
40efc6fa SH |
402 | } |
403 | EXPORT_SYMBOL_GPL(tcp_slow_start); | |
404 | ||
814d488c NC |
405 | /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), |
406 | * for every packet that was ACKed. | |
407 | */ | |
e73ebb08 | 408 | void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) |
758ce5c8 | 409 | { |
9949afa4 NC |
410 | /* If credits accumulated at a higher w, apply them gently now. */ |
411 | if (tp->snd_cwnd_cnt >= w) { | |
412 | tp->snd_cwnd_cnt = 0; | |
413 | tp->snd_cwnd++; | |
414 | } | |
415 | ||
814d488c | 416 | tp->snd_cwnd_cnt += acked; |
758ce5c8 | 417 | if (tp->snd_cwnd_cnt >= w) { |
814d488c NC |
418 | u32 delta = tp->snd_cwnd_cnt / w; |
419 | ||
420 | tp->snd_cwnd_cnt -= delta * w; | |
421 | tp->snd_cwnd += delta; | |
758ce5c8 | 422 | } |
814d488c | 423 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); |
758ce5c8 IJ |
424 | } |
425 | EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); | |
426 | ||
317a76f9 SH |
427 | /* |
428 | * TCP Reno congestion control | |
429 | * This is special case used for fallback as well. | |
430 | */ | |
431 | /* This is Jacobson's slow start and congestion avoidance. | |
432 | * SIGCOMM '88, p. 328. | |
433 | */ | |
24901551 | 434 | void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) |
317a76f9 | 435 | { |
6687e988 ACM |
436 | struct tcp_sock *tp = tcp_sk(sk); |
437 | ||
24901551 | 438 | if (!tcp_is_cwnd_limited(sk)) |
317a76f9 SH |
439 | return; |
440 | ||
7faffa1c | 441 | /* In "safe" area, increase. */ |
071d5080 | 442 | if (tcp_in_slow_start(tp)) { |
c22bdca9 NC |
443 | acked = tcp_slow_start(tp, acked); |
444 | if (!acked) | |
445 | return; | |
446 | } | |
e905a9ed | 447 | /* In dangerous area, increase slowly. */ |
c22bdca9 | 448 | tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); |
317a76f9 SH |
449 | } |
450 | EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); | |
451 | ||
452 | /* Slow start threshold is half the congestion window (min 2) */ | |
6687e988 | 453 | u32 tcp_reno_ssthresh(struct sock *sk) |
317a76f9 | 454 | { |
6687e988 | 455 | const struct tcp_sock *tp = tcp_sk(sk); |
688d1945 | 456 | |
317a76f9 SH |
457 | return max(tp->snd_cwnd >> 1U, 2U); |
458 | } | |
459 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); | |
460 | ||
e9799183 FW |
461 | u32 tcp_reno_undo_cwnd(struct sock *sk) |
462 | { | |
463 | const struct tcp_sock *tp = tcp_sk(sk); | |
464 | ||
4faf7839 | 465 | return max(tp->snd_cwnd, tp->prior_cwnd); |
e9799183 FW |
466 | } |
467 | EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); | |
468 | ||
317a76f9 | 469 | struct tcp_congestion_ops tcp_reno = { |
164891aa | 470 | .flags = TCP_CONG_NON_RESTRICTED, |
317a76f9 SH |
471 | .name = "reno", |
472 | .owner = THIS_MODULE, | |
473 | .ssthresh = tcp_reno_ssthresh, | |
474 | .cong_avoid = tcp_reno_cong_avoid, | |
e9799183 | 475 | .undo_cwnd = tcp_reno_undo_cwnd, |
317a76f9 | 476 | }; |