Commit | Line | Data |
---|---|---|
046f6fd5 THJ |
1 | // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause |
2 | ||
3 | /* COMMON Applications Kept Enhanced (CAKE) discipline | |
4 | * | |
5 | * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com> | |
6 | * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk> | |
7 | * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com> | |
8 | * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de> | |
9 | * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk> | |
10 | * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au> | |
11 | * | |
12 | * The CAKE Principles: | |
13 | * (or, how to have your cake and eat it too) | |
14 | * | |
15 | * This is a combination of several shaping, AQM and FQ techniques into one | |
16 | * easy-to-use package: | |
17 | * | |
18 | * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE | |
19 | * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), | |
20 | * eliminating the need for any sort of burst parameter (eg. token bucket | |
21 | * depth). Burst support is limited to that necessary to overcome scheduling | |
22 | * latency. | |
23 | * | |
24 | * - A Diffserv-aware priority queue, giving more priority to certain classes, | |
25 | * up to a specified fraction of bandwidth. Above that bandwidth threshold, | |
26 | * the priority is reduced to avoid starving other tins. | |
27 | * | |
28 | * - Each priority tin has a separate Flow Queue system, to isolate traffic | |
29 | * flows from each other. This prevents a burst on one flow from increasing | |
30 | * the delay to another. Flows are distributed to queues using a | |
31 | * set-associative hash function. | |
32 | * | |
33 | * - Each queue is actively managed by Cobalt, which is a combination of the | |
34 | * Codel and Blue AQM algorithms. This serves flows fairly, and signals | |
35 | * congestion early via ECN (if available) and/or packet drops, to keep | |
36 | * latency low. The codel parameters are auto-tuned based on the bandwidth | |
37 | * setting, as is necessary at low bandwidths. | |
38 | * | |
39 | * The configuration parameters are kept deliberately simple for ease of use. | |
40 | * Everything has sane defaults. Complete generality of configuration is *not* | |
41 | * a goal. | |
42 | * | |
43 | * The priority queue operates according to a weighted DRR scheme, combined with | |
44 | * a bandwidth tracker which reuses the shaper logic to detect which side of the | |
45 | * bandwidth sharing threshold the tin is operating. This determines whether a | |
46 | * priority-based weight (high) or a bandwidth-based weight (low) is used for | |
47 | * that tin in the current pass. | |
48 | * | |
49 | * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly | |
50 | * granted us permission to leverage. | |
51 | */ | |
52 | ||
53 | #include <linux/module.h> | |
54 | #include <linux/types.h> | |
55 | #include <linux/kernel.h> | |
56 | #include <linux/jiffies.h> | |
57 | #include <linux/string.h> | |
58 | #include <linux/in.h> | |
59 | #include <linux/errno.h> | |
60 | #include <linux/init.h> | |
61 | #include <linux/skbuff.h> | |
62 | #include <linux/jhash.h> | |
63 | #include <linux/slab.h> | |
64 | #include <linux/vmalloc.h> | |
65 | #include <linux/reciprocal_div.h> | |
66 | #include <net/netlink.h> | |
046f6fd5 THJ |
67 | #include <linux/if_vlan.h> |
68 | #include <net/pkt_sched.h> | |
69 | #include <net/pkt_cls.h> | |
70 | #include <net/tcp.h> | |
71 | #include <net/flow_dissector.h> | |
72 | ||
ea825115 THJ |
73 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
74 | #include <net/netfilter/nf_conntrack_core.h> | |
75 | #endif | |
76 | ||
046f6fd5 THJ |
77 | #define CAKE_SET_WAYS (8) |
78 | #define CAKE_MAX_TINS (8) | |
79 | #define CAKE_QUEUES (1024) | |
80 | #define CAKE_FLOW_MASK 63 | |
81 | #define CAKE_FLOW_NAT_FLAG 64 | |
82 | ||
83 | /* struct cobalt_params - contains codel and blue parameters | |
84 | * @interval: codel initial drop rate | |
85 | * @target: maximum persistent sojourn time & blue update rate | |
86 | * @mtu_time: serialisation delay of maximum-size packet | |
87 | * @p_inc: increment of blue drop probability (0.32 fxp) | |
88 | * @p_dec: decrement of blue drop probability (0.32 fxp) | |
89 | */ | |
90 | struct cobalt_params { | |
91 | u64 interval; | |
92 | u64 target; | |
93 | u64 mtu_time; | |
94 | u32 p_inc; | |
95 | u32 p_dec; | |
96 | }; | |
97 | ||
98 | /* struct cobalt_vars - contains codel and blue variables | |
99 | * @count: codel dropping frequency | |
100 | * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 | |
101 | * @drop_next: time to drop next packet, or when we dropped last | |
102 | * @blue_timer: Blue time to next drop | |
103 | * @p_drop: BLUE drop probability (0.32 fxp) | |
104 | * @dropping: set if in dropping state | |
105 | * @ecn_marked: set if marked | |
106 | */ | |
107 | struct cobalt_vars { | |
108 | u32 count; | |
109 | u32 rec_inv_sqrt; | |
110 | ktime_t drop_next; | |
111 | ktime_t blue_timer; | |
112 | u32 p_drop; | |
113 | bool dropping; | |
114 | bool ecn_marked; | |
115 | }; | |
116 | ||
117 | enum { | |
118 | CAKE_SET_NONE = 0, | |
119 | CAKE_SET_SPARSE, | |
120 | CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ | |
121 | CAKE_SET_BULK, | |
122 | CAKE_SET_DECAYING | |
123 | }; | |
124 | ||
125 | struct cake_flow { | |
126 | /* this stuff is all needed per-flow at dequeue time */ | |
127 | struct sk_buff *head; | |
128 | struct sk_buff *tail; | |
129 | struct list_head flowchain; | |
130 | s32 deficit; | |
131 | u32 dropped; | |
132 | struct cobalt_vars cvars; | |
133 | u16 srchost; /* index into cake_host table */ | |
134 | u16 dsthost; | |
135 | u8 set; | |
136 | }; /* please try to keep this structure <= 64 bytes */ | |
137 | ||
138 | struct cake_host { | |
139 | u32 srchost_tag; | |
140 | u32 dsthost_tag; | |
71263992 GA |
141 | u16 srchost_bulk_flow_count; |
142 | u16 dsthost_bulk_flow_count; | |
046f6fd5 THJ |
143 | }; |
144 | ||
145 | struct cake_heap_entry { | |
146 | u16 t:3, b:10; | |
147 | }; | |
148 | ||
149 | struct cake_tin_data { | |
150 | struct cake_flow flows[CAKE_QUEUES]; | |
151 | u32 backlogs[CAKE_QUEUES]; | |
152 | u32 tags[CAKE_QUEUES]; /* for set association */ | |
153 | u16 overflow_idx[CAKE_QUEUES]; | |
154 | struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ | |
155 | u16 flow_quantum; | |
156 | ||
157 | struct cobalt_params cparams; | |
158 | u32 drop_overlimit; | |
159 | u16 bulk_flow_count; | |
160 | u16 sparse_flow_count; | |
161 | u16 decaying_flow_count; | |
162 | u16 unresponsive_flow_count; | |
163 | ||
164 | u32 max_skblen; | |
165 | ||
166 | struct list_head new_flows; | |
167 | struct list_head old_flows; | |
168 | struct list_head decaying_flows; | |
169 | ||
170 | /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ | |
171 | ktime_t time_next_packet; | |
172 | u64 tin_rate_ns; | |
173 | u64 tin_rate_bps; | |
174 | u16 tin_rate_shft; | |
175 | ||
cbd22f17 | 176 | u16 tin_quantum; |
046f6fd5 THJ |
177 | s32 tin_deficit; |
178 | u32 tin_backlog; | |
179 | u32 tin_dropped; | |
180 | u32 tin_ecn_mark; | |
181 | ||
182 | u32 packets; | |
183 | u64 bytes; | |
184 | ||
185 | u32 ack_drops; | |
186 | ||
187 | /* moving averages */ | |
188 | u64 avge_delay; | |
189 | u64 peak_delay; | |
190 | u64 base_delay; | |
191 | ||
192 | /* hash function stats */ | |
193 | u32 way_directs; | |
194 | u32 way_hits; | |
195 | u32 way_misses; | |
196 | u32 way_collisions; | |
197 | }; /* number of tins is small, so size of this struct doesn't matter much */ | |
198 | ||
199 | struct cake_sched_data { | |
200 | struct tcf_proto __rcu *filter_list; /* optional external classifier */ | |
201 | struct tcf_block *block; | |
202 | struct cake_tin_data *tins; | |
203 | ||
204 | struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; | |
205 | u16 overflow_timeout; | |
206 | ||
207 | u16 tin_cnt; | |
208 | u8 tin_mode; | |
209 | u8 flow_mode; | |
210 | u8 ack_filter; | |
211 | u8 atm_mode; | |
212 | ||
eab2fc82 THJ |
213 | u32 fwmark_mask; |
214 | u16 fwmark_shft; | |
215 | ||
046f6fd5 THJ |
216 | /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ |
217 | u16 rate_shft; | |
218 | ktime_t time_next_packet; | |
219 | ktime_t failsafe_next_packet; | |
220 | u64 rate_ns; | |
221 | u64 rate_bps; | |
222 | u16 rate_flags; | |
223 | s16 rate_overhead; | |
224 | u16 rate_mpu; | |
225 | u64 interval; | |
226 | u64 target; | |
227 | ||
228 | /* resource tracking */ | |
229 | u32 buffer_used; | |
230 | u32 buffer_max_used; | |
231 | u32 buffer_limit; | |
232 | u32 buffer_config_limit; | |
233 | ||
234 | /* indices for dequeue */ | |
235 | u16 cur_tin; | |
236 | u16 cur_flow; | |
237 | ||
238 | struct qdisc_watchdog watchdog; | |
239 | const u8 *tin_index; | |
240 | const u8 *tin_order; | |
241 | ||
242 | /* bandwidth capacity estimate */ | |
243 | ktime_t last_packet_time; | |
244 | ktime_t avg_window_begin; | |
245 | u64 avg_packet_interval; | |
246 | u64 avg_window_bytes; | |
247 | u64 avg_peak_bandwidth; | |
248 | ktime_t last_reconfig_time; | |
249 | ||
250 | /* packet length stats */ | |
251 | u32 avg_netoff; | |
252 | u16 max_netlen; | |
253 | u16 max_adjlen; | |
254 | u16 min_netlen; | |
255 | u16 min_adjlen; | |
256 | }; | |
257 | ||
258 | enum { | |
259 | CAKE_FLAG_OVERHEAD = BIT(0), | |
260 | CAKE_FLAG_AUTORATE_INGRESS = BIT(1), | |
261 | CAKE_FLAG_INGRESS = BIT(2), | |
262 | CAKE_FLAG_WASH = BIT(3), | |
eab2fc82 | 263 | CAKE_FLAG_SPLIT_GSO = BIT(4) |
046f6fd5 THJ |
264 | }; |
265 | ||
266 | /* COBALT operates the Codel and BLUE algorithms in parallel, in order to | |
267 | * obtain the best features of each. Codel is excellent on flows which | |
268 | * respond to congestion signals in a TCP-like way. BLUE is more effective on | |
269 | * unresponsive flows. | |
270 | */ | |
271 | ||
272 | struct cobalt_skb_cb { | |
273 | ktime_t enqueue_time; | |
a729b7f0 | 274 | u32 adjusted_len; |
046f6fd5 THJ |
275 | }; |
276 | ||
277 | static u64 us_to_ns(u64 us) | |
278 | { | |
279 | return us * NSEC_PER_USEC; | |
280 | } | |
281 | ||
282 | static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) | |
283 | { | |
284 | qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb)); | |
285 | return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; | |
286 | } | |
287 | ||
288 | static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) | |
289 | { | |
290 | return get_cobalt_cb(skb)->enqueue_time; | |
291 | } | |
292 | ||
293 | static void cobalt_set_enqueue_time(struct sk_buff *skb, | |
294 | ktime_t now) | |
295 | { | |
296 | get_cobalt_cb(skb)->enqueue_time = now; | |
297 | } | |
298 | ||
299 | static u16 quantum_div[CAKE_QUEUES + 1] = {0}; | |
300 | ||
83f8fd69 THJ |
301 | /* Diffserv lookup tables */ |
302 | ||
303 | static const u8 precedence[] = { | |
304 | 0, 0, 0, 0, 0, 0, 0, 0, | |
305 | 1, 1, 1, 1, 1, 1, 1, 1, | |
306 | 2, 2, 2, 2, 2, 2, 2, 2, | |
307 | 3, 3, 3, 3, 3, 3, 3, 3, | |
308 | 4, 4, 4, 4, 4, 4, 4, 4, | |
309 | 5, 5, 5, 5, 5, 5, 5, 5, | |
310 | 6, 6, 6, 6, 6, 6, 6, 6, | |
311 | 7, 7, 7, 7, 7, 7, 7, 7, | |
312 | }; | |
313 | ||
314 | static const u8 diffserv8[] = { | |
b8392808 KDB |
315 | 2, 0, 1, 2, 4, 2, 2, 2, |
316 | 1, 2, 1, 2, 1, 2, 1, 2, | |
83f8fd69 THJ |
317 | 5, 2, 4, 2, 4, 2, 4, 2, |
318 | 3, 2, 3, 2, 3, 2, 3, 2, | |
319 | 6, 2, 3, 2, 3, 2, 3, 2, | |
320 | 6, 2, 2, 2, 6, 2, 6, 2, | |
321 | 7, 2, 2, 2, 2, 2, 2, 2, | |
322 | 7, 2, 2, 2, 2, 2, 2, 2, | |
323 | }; | |
324 | ||
325 | static const u8 diffserv4[] = { | |
b8392808 | 326 | 0, 1, 0, 0, 2, 0, 0, 0, |
83f8fd69 THJ |
327 | 1, 0, 0, 0, 0, 0, 0, 0, |
328 | 2, 0, 2, 0, 2, 0, 2, 0, | |
329 | 2, 0, 2, 0, 2, 0, 2, 0, | |
330 | 3, 0, 2, 0, 2, 0, 2, 0, | |
331 | 3, 0, 0, 0, 3, 0, 3, 0, | |
332 | 3, 0, 0, 0, 0, 0, 0, 0, | |
333 | 3, 0, 0, 0, 0, 0, 0, 0, | |
334 | }; | |
335 | ||
336 | static const u8 diffserv3[] = { | |
b8392808 | 337 | 0, 1, 0, 0, 2, 0, 0, 0, |
83f8fd69 THJ |
338 | 1, 0, 0, 0, 0, 0, 0, 0, |
339 | 0, 0, 0, 0, 0, 0, 0, 0, | |
340 | 0, 0, 0, 0, 0, 0, 0, 0, | |
341 | 0, 0, 0, 0, 0, 0, 0, 0, | |
342 | 0, 0, 0, 0, 2, 0, 2, 0, | |
343 | 2, 0, 0, 0, 0, 0, 0, 0, | |
344 | 2, 0, 0, 0, 0, 0, 0, 0, | |
345 | }; | |
346 | ||
347 | static const u8 besteffort[] = { | |
348 | 0, 0, 0, 0, 0, 0, 0, 0, | |
349 | 0, 0, 0, 0, 0, 0, 0, 0, | |
350 | 0, 0, 0, 0, 0, 0, 0, 0, | |
351 | 0, 0, 0, 0, 0, 0, 0, 0, | |
352 | 0, 0, 0, 0, 0, 0, 0, 0, | |
353 | 0, 0, 0, 0, 0, 0, 0, 0, | |
354 | 0, 0, 0, 0, 0, 0, 0, 0, | |
355 | 0, 0, 0, 0, 0, 0, 0, 0, | |
356 | }; | |
357 | ||
358 | /* tin priority order for stats dumping */ | |
359 | ||
360 | static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; | |
361 | static const u8 bulk_order[] = {1, 0, 2, 3}; | |
362 | ||
046f6fd5 THJ |
363 | #define REC_INV_SQRT_CACHE (16) |
364 | static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; | |
365 | ||
366 | /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots | |
367 | * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) | |
368 | * | |
369 | * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 | |
370 | */ | |
371 | ||
372 | static void cobalt_newton_step(struct cobalt_vars *vars) | |
373 | { | |
374 | u32 invsqrt, invsqrt2; | |
375 | u64 val; | |
376 | ||
377 | invsqrt = vars->rec_inv_sqrt; | |
378 | invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; | |
379 | val = (3LL << 32) - ((u64)vars->count * invsqrt2); | |
380 | ||
381 | val >>= 2; /* avoid overflow in following multiply */ | |
382 | val = (val * invsqrt) >> (32 - 2 + 1); | |
383 | ||
384 | vars->rec_inv_sqrt = val; | |
385 | } | |
386 | ||
387 | static void cobalt_invsqrt(struct cobalt_vars *vars) | |
388 | { | |
389 | if (vars->count < REC_INV_SQRT_CACHE) | |
390 | vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count]; | |
391 | else | |
392 | cobalt_newton_step(vars); | |
393 | } | |
394 | ||
395 | /* There is a big difference in timing between the accurate values placed in | |
396 | * the cache and the approximations given by a single Newton step for small | |
397 | * count values, particularly when stepping from count 1 to 2 or vice versa. | |
398 | * Above 16, a single Newton step gives sufficient accuracy in either | |
399 | * direction, given the precision stored. | |
400 | * | |
401 | * The magnitude of the error when stepping up to count 2 is such as to give | |
402 | * the value that *should* have been produced at count 4. | |
403 | */ | |
404 | ||
405 | static void cobalt_cache_init(void) | |
406 | { | |
407 | struct cobalt_vars v; | |
408 | ||
409 | memset(&v, 0, sizeof(v)); | |
410 | v.rec_inv_sqrt = ~0U; | |
411 | cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt; | |
412 | ||
413 | for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) { | |
414 | cobalt_newton_step(&v); | |
415 | cobalt_newton_step(&v); | |
416 | cobalt_newton_step(&v); | |
417 | cobalt_newton_step(&v); | |
418 | ||
419 | cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt; | |
420 | } | |
421 | } | |
422 | ||
423 | static void cobalt_vars_init(struct cobalt_vars *vars) | |
424 | { | |
425 | memset(vars, 0, sizeof(*vars)); | |
426 | ||
427 | if (!cobalt_rec_inv_sqrt_cache[0]) { | |
428 | cobalt_cache_init(); | |
429 | cobalt_rec_inv_sqrt_cache[0] = ~0; | |
430 | } | |
431 | } | |
432 | ||
433 | /* CoDel control_law is t + interval/sqrt(count) | |
434 | * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid | |
435 | * both sqrt() and divide operation. | |
436 | */ | |
437 | static ktime_t cobalt_control(ktime_t t, | |
438 | u64 interval, | |
439 | u32 rec_inv_sqrt) | |
440 | { | |
441 | return ktime_add_ns(t, reciprocal_scale(interval, | |
442 | rec_inv_sqrt)); | |
443 | } | |
444 | ||
445 | /* Call this when a packet had to be dropped due to queue overflow. Returns | |
446 | * true if the BLUE state was quiescent before but active after this call. | |
447 | */ | |
448 | static bool cobalt_queue_full(struct cobalt_vars *vars, | |
449 | struct cobalt_params *p, | |
450 | ktime_t now) | |
451 | { | |
452 | bool up = false; | |
453 | ||
454 | if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { | |
455 | up = !vars->p_drop; | |
456 | vars->p_drop += p->p_inc; | |
457 | if (vars->p_drop < p->p_inc) | |
458 | vars->p_drop = ~0; | |
459 | vars->blue_timer = now; | |
460 | } | |
461 | vars->dropping = true; | |
462 | vars->drop_next = now; | |
463 | if (!vars->count) | |
464 | vars->count = 1; | |
465 | ||
466 | return up; | |
467 | } | |
468 | ||
469 | /* Call this when the queue was serviced but turned out to be empty. Returns | |
470 | * true if the BLUE state was active before but quiescent after this call. | |
471 | */ | |
472 | static bool cobalt_queue_empty(struct cobalt_vars *vars, | |
473 | struct cobalt_params *p, | |
474 | ktime_t now) | |
475 | { | |
476 | bool down = false; | |
477 | ||
478 | if (vars->p_drop && | |
479 | ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { | |
480 | if (vars->p_drop < p->p_dec) | |
481 | vars->p_drop = 0; | |
482 | else | |
483 | vars->p_drop -= p->p_dec; | |
484 | vars->blue_timer = now; | |
485 | down = !vars->p_drop; | |
486 | } | |
487 | vars->dropping = false; | |
488 | ||
489 | if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { | |
490 | vars->count--; | |
491 | cobalt_invsqrt(vars); | |
492 | vars->drop_next = cobalt_control(vars->drop_next, | |
493 | p->interval, | |
494 | vars->rec_inv_sqrt); | |
495 | } | |
496 | ||
497 | return down; | |
498 | } | |
499 | ||
500 | /* Call this with a freshly dequeued packet for possible congestion marking. | |
501 | * Returns true as an instruction to drop the packet, false for delivery. | |
502 | */ | |
503 | static bool cobalt_should_drop(struct cobalt_vars *vars, | |
504 | struct cobalt_params *p, | |
505 | ktime_t now, | |
7298de9c THJ |
506 | struct sk_buff *skb, |
507 | u32 bulk_flows) | |
046f6fd5 THJ |
508 | { |
509 | bool next_due, over_target, drop = false; | |
510 | ktime_t schedule; | |
511 | u64 sojourn; | |
512 | ||
513 | /* The 'schedule' variable records, in its sign, whether 'now' is before or | |
514 | * after 'drop_next'. This allows 'drop_next' to be updated before the next | |
515 | * scheduling decision is actually branched, without destroying that | |
516 | * information. Similarly, the first 'schedule' value calculated is preserved | |
517 | * in the boolean 'next_due'. | |
518 | * | |
519 | * As for 'drop_next', we take advantage of the fact that 'interval' is both | |
520 | * the delay between first exceeding 'target' and the first signalling event, | |
521 | * *and* the scaling factor for the signalling frequency. It's therefore very | |
522 | * natural to use a single mechanism for both purposes, and eliminates a | |
523 | * significant amount of reference Codel's spaghetti code. To help with this, | |
524 | * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close | |
525 | * as possible to 1.0 in fixed-point. | |
526 | */ | |
527 | ||
528 | sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); | |
529 | schedule = ktime_sub(now, vars->drop_next); | |
530 | over_target = sojourn > p->target && | |
7298de9c | 531 | sojourn > p->mtu_time * bulk_flows * 2 && |
046f6fd5 THJ |
532 | sojourn > p->mtu_time * 4; |
533 | next_due = vars->count && ktime_to_ns(schedule) >= 0; | |
534 | ||
535 | vars->ecn_marked = false; | |
536 | ||
537 | if (over_target) { | |
538 | if (!vars->dropping) { | |
539 | vars->dropping = true; | |
540 | vars->drop_next = cobalt_control(now, | |
541 | p->interval, | |
542 | vars->rec_inv_sqrt); | |
543 | } | |
544 | if (!vars->count) | |
545 | vars->count = 1; | |
546 | } else if (vars->dropping) { | |
547 | vars->dropping = false; | |
548 | } | |
549 | ||
550 | if (next_due && vars->dropping) { | |
551 | /* Use ECN mark if possible, otherwise drop */ | |
552 | drop = !(vars->ecn_marked = INET_ECN_set_ce(skb)); | |
553 | ||
554 | vars->count++; | |
555 | if (!vars->count) | |
556 | vars->count--; | |
557 | cobalt_invsqrt(vars); | |
558 | vars->drop_next = cobalt_control(vars->drop_next, | |
559 | p->interval, | |
560 | vars->rec_inv_sqrt); | |
561 | schedule = ktime_sub(now, vars->drop_next); | |
562 | } else { | |
563 | while (next_due) { | |
564 | vars->count--; | |
565 | cobalt_invsqrt(vars); | |
566 | vars->drop_next = cobalt_control(vars->drop_next, | |
567 | p->interval, | |
568 | vars->rec_inv_sqrt); | |
569 | schedule = ktime_sub(now, vars->drop_next); | |
570 | next_due = vars->count && ktime_to_ns(schedule) >= 0; | |
571 | } | |
572 | } | |
573 | ||
574 | /* Simple BLUE implementation. Lack of ECN is deliberate. */ | |
575 | if (vars->p_drop) | |
a251c17a | 576 | drop |= (get_random_u32() < vars->p_drop); |
046f6fd5 THJ |
577 | |
578 | /* Overload the drop_next field as an activity timeout */ | |
579 | if (!vars->count) | |
580 | vars->drop_next = ktime_add_ns(now, p->interval); | |
581 | else if (ktime_to_ns(schedule) > 0 && !drop) | |
582 | vars->drop_next = now; | |
583 | ||
584 | return drop; | |
585 | } | |
586 | ||
b0c19ed6 | 587 | static bool cake_update_flowkeys(struct flow_keys *keys, |
ea825115 THJ |
588 | const struct sk_buff *skb) |
589 | { | |
590 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) | |
591 | struct nf_conntrack_tuple tuple = {}; | |
b0c19ed6 THJ |
592 | bool rev = !skb->_nfct, upd = false; |
593 | __be32 ip; | |
ea825115 | 594 | |
d7bf2ebe | 595 | if (skb_protocol(skb, true) != htons(ETH_P_IP)) |
b0c19ed6 | 596 | return false; |
ea825115 THJ |
597 | |
598 | if (!nf_ct_get_tuple_skb(&tuple, skb)) | |
b0c19ed6 | 599 | return false; |
ea825115 | 600 | |
b0c19ed6 THJ |
601 | ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; |
602 | if (ip != keys->addrs.v4addrs.src) { | |
603 | keys->addrs.v4addrs.src = ip; | |
604 | upd = true; | |
605 | } | |
606 | ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; | |
607 | if (ip != keys->addrs.v4addrs.dst) { | |
608 | keys->addrs.v4addrs.dst = ip; | |
609 | upd = true; | |
610 | } | |
ea825115 THJ |
611 | |
612 | if (keys->ports.ports) { | |
b0c19ed6 THJ |
613 | __be16 port; |
614 | ||
615 | port = rev ? tuple.dst.u.all : tuple.src.u.all; | |
616 | if (port != keys->ports.src) { | |
617 | keys->ports.src = port; | |
618 | upd = true; | |
619 | } | |
620 | port = rev ? tuple.src.u.all : tuple.dst.u.all; | |
621 | if (port != keys->ports.dst) { | |
622 | port = keys->ports.dst; | |
623 | upd = true; | |
624 | } | |
ea825115 | 625 | } |
b0c19ed6 THJ |
626 | return upd; |
627 | #else | |
628 | return false; | |
ea825115 THJ |
629 | #endif |
630 | } | |
631 | ||
046f6fd5 THJ |
632 | /* Cake has several subtle multiple bit settings. In these cases you |
633 | * would be matching triple isolate mode as well. | |
634 | */ | |
635 | ||
636 | static bool cake_dsrc(int flow_mode) | |
637 | { | |
638 | return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; | |
639 | } | |
640 | ||
641 | static bool cake_ddst(int flow_mode) | |
642 | { | |
643 | return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; | |
644 | } | |
645 | ||
646 | static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, | |
93cfb6c1 | 647 | int flow_mode, u16 flow_override, u16 host_override) |
046f6fd5 | 648 | { |
b0c19ed6 THJ |
649 | bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); |
650 | bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); | |
651 | bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); | |
93cfb6c1 | 652 | u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; |
046f6fd5 THJ |
653 | u16 reduced_hash, srchost_idx, dsthost_idx; |
654 | struct flow_keys keys, host_keys; | |
b0c19ed6 | 655 | bool use_skbhash = skb->l4_hash; |
046f6fd5 THJ |
656 | |
657 | if (unlikely(flow_mode == CAKE_FLOW_NONE)) | |
658 | return 0; | |
659 | ||
b0c19ed6 THJ |
660 | /* If both overrides are set, or we can use the SKB hash and nat mode is |
661 | * disabled, we can skip packet dissection entirely. If nat mode is | |
662 | * enabled there's another check below after doing the conntrack lookup. | |
663 | */ | |
664 | if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) | |
93cfb6c1 THJ |
665 | goto skip_hash; |
666 | ||
046f6fd5 THJ |
667 | skb_flow_dissect_flow_keys(skb, &keys, |
668 | FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); | |
669 | ||
b0c19ed6 THJ |
670 | /* Don't use the SKB hash if we change the lookup keys from conntrack */ |
671 | if (nat_enabled && cake_update_flowkeys(&keys, skb)) | |
672 | use_skbhash = false; | |
673 | ||
674 | /* If we can still use the SKB hash and don't need the host hash, we can | |
675 | * skip the rest of the hashing procedure | |
676 | */ | |
677 | if (use_skbhash && !hash_hosts) | |
678 | goto skip_hash; | |
ea825115 | 679 | |
046f6fd5 THJ |
680 | /* flow_hash_from_keys() sorts the addresses by value, so we have |
681 | * to preserve their order in a separate data structure to treat | |
682 | * src and dst host addresses as independently selectable. | |
683 | */ | |
684 | host_keys = keys; | |
685 | host_keys.ports.ports = 0; | |
686 | host_keys.basic.ip_proto = 0; | |
687 | host_keys.keyid.keyid = 0; | |
688 | host_keys.tags.flow_label = 0; | |
689 | ||
690 | switch (host_keys.control.addr_type) { | |
691 | case FLOW_DISSECTOR_KEY_IPV4_ADDRS: | |
692 | host_keys.addrs.v4addrs.src = 0; | |
693 | dsthost_hash = flow_hash_from_keys(&host_keys); | |
694 | host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; | |
695 | host_keys.addrs.v4addrs.dst = 0; | |
696 | srchost_hash = flow_hash_from_keys(&host_keys); | |
697 | break; | |
698 | ||
699 | case FLOW_DISSECTOR_KEY_IPV6_ADDRS: | |
700 | memset(&host_keys.addrs.v6addrs.src, 0, | |
701 | sizeof(host_keys.addrs.v6addrs.src)); | |
702 | dsthost_hash = flow_hash_from_keys(&host_keys); | |
703 | host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; | |
704 | memset(&host_keys.addrs.v6addrs.dst, 0, | |
705 | sizeof(host_keys.addrs.v6addrs.dst)); | |
706 | srchost_hash = flow_hash_from_keys(&host_keys); | |
707 | break; | |
708 | ||
709 | default: | |
710 | dsthost_hash = 0; | |
711 | srchost_hash = 0; | |
712 | } | |
713 | ||
714 | /* This *must* be after the above switch, since as a | |
715 | * side-effect it sorts the src and dst addresses. | |
716 | */ | |
b0c19ed6 | 717 | if (hash_flows && !use_skbhash) |
046f6fd5 THJ |
718 | flow_hash = flow_hash_from_keys(&keys); |
719 | ||
93cfb6c1 THJ |
720 | skip_hash: |
721 | if (flow_override) | |
722 | flow_hash = flow_override - 1; | |
86b9bbd3 | 723 | else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS)) |
b0c19ed6 | 724 | flow_hash = skb->hash; |
93cfb6c1 THJ |
725 | if (host_override) { |
726 | dsthost_hash = host_override - 1; | |
727 | srchost_hash = host_override - 1; | |
728 | } | |
729 | ||
046f6fd5 THJ |
730 | if (!(flow_mode & CAKE_FLOW_FLOWS)) { |
731 | if (flow_mode & CAKE_FLOW_SRC_IP) | |
732 | flow_hash ^= srchost_hash; | |
733 | ||
734 | if (flow_mode & CAKE_FLOW_DST_IP) | |
735 | flow_hash ^= dsthost_hash; | |
736 | } | |
737 | ||
738 | reduced_hash = flow_hash % CAKE_QUEUES; | |
739 | ||
740 | /* set-associative hashing */ | |
741 | /* fast path if no hash collision (direct lookup succeeds) */ | |
742 | if (likely(q->tags[reduced_hash] == flow_hash && | |
743 | q->flows[reduced_hash].set)) { | |
744 | q->way_directs++; | |
745 | } else { | |
746 | u32 inner_hash = reduced_hash % CAKE_SET_WAYS; | |
747 | u32 outer_hash = reduced_hash - inner_hash; | |
748 | bool allocate_src = false; | |
749 | bool allocate_dst = false; | |
750 | u32 i, k; | |
751 | ||
752 | /* check if any active queue in the set is reserved for | |
753 | * this flow. | |
754 | */ | |
755 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; | |
756 | i++, k = (k + 1) % CAKE_SET_WAYS) { | |
757 | if (q->tags[outer_hash + k] == flow_hash) { | |
758 | if (i) | |
759 | q->way_hits++; | |
760 | ||
761 | if (!q->flows[outer_hash + k].set) { | |
762 | /* need to increment host refcnts */ | |
763 | allocate_src = cake_dsrc(flow_mode); | |
764 | allocate_dst = cake_ddst(flow_mode); | |
765 | } | |
766 | ||
767 | goto found; | |
768 | } | |
769 | } | |
770 | ||
771 | /* no queue is reserved for this flow, look for an | |
772 | * empty one. | |
773 | */ | |
774 | for (i = 0; i < CAKE_SET_WAYS; | |
775 | i++, k = (k + 1) % CAKE_SET_WAYS) { | |
776 | if (!q->flows[outer_hash + k].set) { | |
777 | q->way_misses++; | |
778 | allocate_src = cake_dsrc(flow_mode); | |
779 | allocate_dst = cake_ddst(flow_mode); | |
780 | goto found; | |
781 | } | |
782 | } | |
783 | ||
784 | /* With no empty queues, default to the original | |
785 | * queue, accept the collision, update the host tags. | |
786 | */ | |
787 | q->way_collisions++; | |
71263992 GA |
788 | if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { |
789 | q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; | |
790 | q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; | |
791 | } | |
046f6fd5 THJ |
792 | allocate_src = cake_dsrc(flow_mode); |
793 | allocate_dst = cake_ddst(flow_mode); | |
794 | found: | |
795 | /* reserve queue for future packets in same flow */ | |
796 | reduced_hash = outer_hash + k; | |
797 | q->tags[reduced_hash] = flow_hash; | |
798 | ||
799 | if (allocate_src) { | |
800 | srchost_idx = srchost_hash % CAKE_QUEUES; | |
801 | inner_hash = srchost_idx % CAKE_SET_WAYS; | |
802 | outer_hash = srchost_idx - inner_hash; | |
803 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; | |
804 | i++, k = (k + 1) % CAKE_SET_WAYS) { | |
805 | if (q->hosts[outer_hash + k].srchost_tag == | |
806 | srchost_hash) | |
807 | goto found_src; | |
808 | } | |
809 | for (i = 0; i < CAKE_SET_WAYS; | |
810 | i++, k = (k + 1) % CAKE_SET_WAYS) { | |
71263992 | 811 | if (!q->hosts[outer_hash + k].srchost_bulk_flow_count) |
046f6fd5 THJ |
812 | break; |
813 | } | |
814 | q->hosts[outer_hash + k].srchost_tag = srchost_hash; | |
815 | found_src: | |
816 | srchost_idx = outer_hash + k; | |
71263992 GA |
817 | if (q->flows[reduced_hash].set == CAKE_SET_BULK) |
818 | q->hosts[srchost_idx].srchost_bulk_flow_count++; | |
046f6fd5 THJ |
819 | q->flows[reduced_hash].srchost = srchost_idx; |
820 | } | |
821 | ||
822 | if (allocate_dst) { | |
823 | dsthost_idx = dsthost_hash % CAKE_QUEUES; | |
824 | inner_hash = dsthost_idx % CAKE_SET_WAYS; | |
825 | outer_hash = dsthost_idx - inner_hash; | |
826 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; | |
827 | i++, k = (k + 1) % CAKE_SET_WAYS) { | |
828 | if (q->hosts[outer_hash + k].dsthost_tag == | |
829 | dsthost_hash) | |
830 | goto found_dst; | |
831 | } | |
832 | for (i = 0; i < CAKE_SET_WAYS; | |
833 | i++, k = (k + 1) % CAKE_SET_WAYS) { | |
71263992 | 834 | if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count) |
046f6fd5 THJ |
835 | break; |
836 | } | |
837 | q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; | |
838 | found_dst: | |
839 | dsthost_idx = outer_hash + k; | |
71263992 GA |
840 | if (q->flows[reduced_hash].set == CAKE_SET_BULK) |
841 | q->hosts[dsthost_idx].dsthost_bulk_flow_count++; | |
046f6fd5 THJ |
842 | q->flows[reduced_hash].dsthost = dsthost_idx; |
843 | } | |
844 | } | |
845 | ||
846 | return reduced_hash; | |
847 | } | |
848 | ||
849 | /* helper functions : might be changed when/if skb use a standard list_head */ | |
850 | /* remove one skb from head of slot queue */ | |
851 | ||
852 | static struct sk_buff *dequeue_head(struct cake_flow *flow) | |
853 | { | |
854 | struct sk_buff *skb = flow->head; | |
855 | ||
856 | if (skb) { | |
857 | flow->head = skb->next; | |
a8305bff | 858 | skb_mark_not_on_list(skb); |
046f6fd5 THJ |
859 | } |
860 | ||
861 | return skb; | |
862 | } | |
863 | ||
864 | /* add skb to flow queue (tail add) */ | |
865 | ||
866 | static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) | |
867 | { | |
868 | if (!flow->head) | |
869 | flow->head = skb; | |
870 | else | |
871 | flow->tail->next = skb; | |
872 | flow->tail = skb; | |
873 | skb->next = NULL; | |
874 | } | |
875 | ||
8b713881 THJ |
876 | static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, |
877 | struct ipv6hdr *buf) | |
878 | { | |
879 | unsigned int offset = skb_network_offset(skb); | |
880 | struct iphdr *iph; | |
881 | ||
882 | iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); | |
883 | ||
884 | if (!iph) | |
885 | return NULL; | |
886 | ||
887 | if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) | |
888 | return skb_header_pointer(skb, offset + iph->ihl * 4, | |
889 | sizeof(struct ipv6hdr), buf); | |
890 | ||
891 | else if (iph->version == 4) | |
892 | return iph; | |
893 | ||
894 | else if (iph->version == 6) | |
895 | return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), | |
896 | buf); | |
897 | ||
898 | return NULL; | |
899 | } | |
900 | ||
901 | static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, | |
902 | void *buf, unsigned int bufsize) | |
903 | { | |
904 | unsigned int offset = skb_network_offset(skb); | |
905 | const struct ipv6hdr *ipv6h; | |
906 | const struct tcphdr *tcph; | |
907 | const struct iphdr *iph; | |
908 | struct ipv6hdr _ipv6h; | |
909 | struct tcphdr _tcph; | |
910 | ||
911 | ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); | |
912 | ||
913 | if (!ipv6h) | |
914 | return NULL; | |
915 | ||
916 | if (ipv6h->version == 4) { | |
917 | iph = (struct iphdr *)ipv6h; | |
918 | offset += iph->ihl * 4; | |
919 | ||
920 | /* special-case 6in4 tunnelling, as that is a common way to get | |
921 | * v6 connectivity in the home | |
922 | */ | |
923 | if (iph->protocol == IPPROTO_IPV6) { | |
924 | ipv6h = skb_header_pointer(skb, offset, | |
925 | sizeof(_ipv6h), &_ipv6h); | |
926 | ||
927 | if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) | |
928 | return NULL; | |
929 | ||
930 | offset += sizeof(struct ipv6hdr); | |
931 | ||
932 | } else if (iph->protocol != IPPROTO_TCP) { | |
933 | return NULL; | |
934 | } | |
935 | ||
936 | } else if (ipv6h->version == 6) { | |
937 | if (ipv6h->nexthdr != IPPROTO_TCP) | |
938 | return NULL; | |
939 | ||
940 | offset += sizeof(struct ipv6hdr); | |
941 | } else { | |
942 | return NULL; | |
943 | } | |
944 | ||
945 | tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); | |
ba91c49d | 946 | if (!tcph || tcph->doff < 5) |
8b713881 THJ |
947 | return NULL; |
948 | ||
949 | return skb_header_pointer(skb, offset, | |
950 | min(__tcp_hdrlen(tcph), bufsize), buf); | |
951 | } | |
952 | ||
953 | static const void *cake_get_tcpopt(const struct tcphdr *tcph, | |
954 | int code, int *oplen) | |
955 | { | |
956 | /* inspired by tcp_parse_options in tcp_input.c */ | |
957 | int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); | |
958 | const u8 *ptr = (const u8 *)(tcph + 1); | |
959 | ||
960 | while (length > 0) { | |
961 | int opcode = *ptr++; | |
962 | int opsize; | |
963 | ||
964 | if (opcode == TCPOPT_EOL) | |
965 | break; | |
966 | if (opcode == TCPOPT_NOP) { | |
967 | length--; | |
968 | continue; | |
969 | } | |
ba91c49d MM |
970 | if (length < 2) |
971 | break; | |
8b713881 THJ |
972 | opsize = *ptr++; |
973 | if (opsize < 2 || opsize > length) | |
974 | break; | |
975 | ||
976 | if (opcode == code) { | |
977 | *oplen = opsize; | |
978 | return ptr; | |
979 | } | |
980 | ||
981 | ptr += opsize - 2; | |
982 | length -= opsize; | |
983 | } | |
984 | ||
985 | return NULL; | |
986 | } | |
987 | ||
988 | /* Compare two SACK sequences. A sequence is considered greater if it SACKs more | |
989 | * bytes than the other. In the case where both sequences ACKs bytes that the | |
990 | * other doesn't, A is considered greater. DSACKs in A also makes A be | |
991 | * considered greater. | |
992 | * | |
993 | * @return -1, 0 or 1 as normal compare functions | |
994 | */ | |
995 | static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, | |
996 | const struct tcphdr *tcph_b) | |
997 | { | |
998 | const struct tcp_sack_block_wire *sack_a, *sack_b; | |
999 | u32 ack_seq_a = ntohl(tcph_a->ack_seq); | |
1000 | u32 bytes_a = 0, bytes_b = 0; | |
1001 | int oplen_a, oplen_b; | |
1002 | bool first = true; | |
1003 | ||
1004 | sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a); | |
1005 | sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b); | |
1006 | ||
1007 | /* pointers point to option contents */ | |
1008 | oplen_a -= TCPOLEN_SACK_BASE; | |
1009 | oplen_b -= TCPOLEN_SACK_BASE; | |
1010 | ||
1011 | if (sack_a && oplen_a >= sizeof(*sack_a) && | |
1012 | (!sack_b || oplen_b < sizeof(*sack_b))) | |
1013 | return -1; | |
1014 | else if (sack_b && oplen_b >= sizeof(*sack_b) && | |
1015 | (!sack_a || oplen_a < sizeof(*sack_a))) | |
1016 | return 1; | |
1017 | else if ((!sack_a || oplen_a < sizeof(*sack_a)) && | |
1018 | (!sack_b || oplen_b < sizeof(*sack_b))) | |
1019 | return 0; | |
1020 | ||
1021 | while (oplen_a >= sizeof(*sack_a)) { | |
1022 | const struct tcp_sack_block_wire *sack_tmp = sack_b; | |
1023 | u32 start_a = get_unaligned_be32(&sack_a->start_seq); | |
1024 | u32 end_a = get_unaligned_be32(&sack_a->end_seq); | |
1025 | int oplen_tmp = oplen_b; | |
1026 | bool found = false; | |
1027 | ||
1028 | /* DSACK; always considered greater to prevent dropping */ | |
1029 | if (before(start_a, ack_seq_a)) | |
1030 | return -1; | |
1031 | ||
1032 | bytes_a += end_a - start_a; | |
1033 | ||
1034 | while (oplen_tmp >= sizeof(*sack_tmp)) { | |
1035 | u32 start_b = get_unaligned_be32(&sack_tmp->start_seq); | |
1036 | u32 end_b = get_unaligned_be32(&sack_tmp->end_seq); | |
1037 | ||
1038 | /* first time through we count the total size */ | |
1039 | if (first) | |
1040 | bytes_b += end_b - start_b; | |
1041 | ||
1042 | if (!after(start_b, start_a) && !before(end_b, end_a)) { | |
1043 | found = true; | |
1044 | if (!first) | |
1045 | break; | |
1046 | } | |
1047 | oplen_tmp -= sizeof(*sack_tmp); | |
1048 | sack_tmp++; | |
1049 | } | |
1050 | ||
1051 | if (!found) | |
1052 | return -1; | |
1053 | ||
1054 | oplen_a -= sizeof(*sack_a); | |
1055 | sack_a++; | |
1056 | first = false; | |
1057 | } | |
1058 | ||
1059 | /* If we made it this far, all ranges SACKed by A are covered by B, so | |
1060 | * either the SACKs are equal, or B SACKs more bytes. | |
1061 | */ | |
1062 | return bytes_b > bytes_a ? 1 : 0; | |
1063 | } | |
1064 | ||
1065 | static void cake_tcph_get_tstamp(const struct tcphdr *tcph, | |
1066 | u32 *tsval, u32 *tsecr) | |
1067 | { | |
1068 | const u8 *ptr; | |
1069 | int opsize; | |
1070 | ||
1071 | ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize); | |
1072 | ||
1073 | if (ptr && opsize == TCPOLEN_TIMESTAMP) { | |
1074 | *tsval = get_unaligned_be32(ptr); | |
1075 | *tsecr = get_unaligned_be32(ptr + 4); | |
1076 | } | |
1077 | } | |
1078 | ||
1079 | static bool cake_tcph_may_drop(const struct tcphdr *tcph, | |
1080 | u32 tstamp_new, u32 tsecr_new) | |
1081 | { | |
1082 | /* inspired by tcp_parse_options in tcp_input.c */ | |
1083 | int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); | |
1084 | const u8 *ptr = (const u8 *)(tcph + 1); | |
1085 | u32 tstamp, tsecr; | |
1086 | ||
1087 | /* 3 reserved flags must be unset to avoid future breakage | |
1088 | * ACK must be set | |
1089 | * ECE/CWR are handled separately | |
1090 | * All other flags URG/PSH/RST/SYN/FIN must be unset | |
1091 | * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) | |
1092 | * 0x00C00000 = CWR/ECE (handled separately) | |
1093 | * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 | |
1094 | */ | |
1095 | if (((tcp_flag_word(tcph) & | |
1096 | cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) | |
1097 | return false; | |
1098 | ||
1099 | while (length > 0) { | |
1100 | int opcode = *ptr++; | |
1101 | int opsize; | |
1102 | ||
1103 | if (opcode == TCPOPT_EOL) | |
1104 | break; | |
1105 | if (opcode == TCPOPT_NOP) { | |
1106 | length--; | |
1107 | continue; | |
1108 | } | |
ba91c49d MM |
1109 | if (length < 2) |
1110 | break; | |
8b713881 THJ |
1111 | opsize = *ptr++; |
1112 | if (opsize < 2 || opsize > length) | |
1113 | break; | |
1114 | ||
1115 | switch (opcode) { | |
1116 | case TCPOPT_MD5SIG: /* doesn't influence state */ | |
1117 | break; | |
1118 | ||
1119 | case TCPOPT_SACK: /* stricter checking performed later */ | |
1120 | if (opsize % 8 != 2) | |
1121 | return false; | |
1122 | break; | |
1123 | ||
1124 | case TCPOPT_TIMESTAMP: | |
1125 | /* only drop timestamps lower than new */ | |
1126 | if (opsize != TCPOLEN_TIMESTAMP) | |
1127 | return false; | |
1128 | tstamp = get_unaligned_be32(ptr); | |
1129 | tsecr = get_unaligned_be32(ptr + 4); | |
1130 | if (after(tstamp, tstamp_new) || | |
1131 | after(tsecr, tsecr_new)) | |
1132 | return false; | |
1133 | break; | |
1134 | ||
1135 | case TCPOPT_MSS: /* these should only be set on SYN */ | |
1136 | case TCPOPT_WINDOW: | |
1137 | case TCPOPT_SACK_PERM: | |
1138 | case TCPOPT_FASTOPEN: | |
1139 | case TCPOPT_EXP: | |
1140 | default: /* don't drop if any unknown options are present */ | |
1141 | return false; | |
1142 | } | |
1143 | ||
1144 | ptr += opsize - 2; | |
1145 | length -= opsize; | |
1146 | } | |
1147 | ||
1148 | return true; | |
1149 | } | |
1150 | ||
1151 | static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, | |
1152 | struct cake_flow *flow) | |
1153 | { | |
1154 | bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; | |
1155 | struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; | |
1156 | struct sk_buff *skb_check, *skb_prev = NULL; | |
1157 | const struct ipv6hdr *ipv6h, *ipv6h_check; | |
1158 | unsigned char _tcph[64], _tcph_check[64]; | |
1159 | const struct tcphdr *tcph, *tcph_check; | |
1160 | const struct iphdr *iph, *iph_check; | |
1161 | struct ipv6hdr _iph, _iph_check; | |
1162 | const struct sk_buff *skb; | |
1163 | int seglen, num_found = 0; | |
1164 | u32 tstamp = 0, tsecr = 0; | |
1165 | __be32 elig_flags = 0; | |
1166 | int sack_comp; | |
1167 | ||
1168 | /* no other possible ACKs to filter */ | |
1169 | if (flow->head == flow->tail) | |
1170 | return NULL; | |
1171 | ||
1172 | skb = flow->tail; | |
1173 | tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph)); | |
1174 | iph = cake_get_iphdr(skb, &_iph); | |
1175 | if (!tcph) | |
1176 | return NULL; | |
1177 | ||
1178 | cake_tcph_get_tstamp(tcph, &tstamp, &tsecr); | |
1179 | ||
1180 | /* the 'triggering' packet need only have the ACK flag set. | |
1181 | * also check that SYN is not set, as there won't be any previous ACKs. | |
1182 | */ | |
1183 | if ((tcp_flag_word(tcph) & | |
1184 | (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) | |
1185 | return NULL; | |
1186 | ||
1187 | /* the 'triggering' ACK is at the tail of the queue, we have already | |
1188 | * returned if it is the only packet in the flow. loop through the rest | |
1189 | * of the queue looking for pure ACKs with the same 5-tuple as the | |
1190 | * triggering one. | |
1191 | */ | |
1192 | for (skb_check = flow->head; | |
1193 | skb_check && skb_check != skb; | |
1194 | skb_prev = skb_check, skb_check = skb_check->next) { | |
1195 | iph_check = cake_get_iphdr(skb_check, &_iph_check); | |
1196 | tcph_check = cake_get_tcphdr(skb_check, &_tcph_check, | |
1197 | sizeof(_tcph_check)); | |
1198 | ||
1199 | /* only TCP packets with matching 5-tuple are eligible, and only | |
1200 | * drop safe headers | |
1201 | */ | |
1202 | if (!tcph_check || iph->version != iph_check->version || | |
1203 | tcph_check->source != tcph->source || | |
1204 | tcph_check->dest != tcph->dest) | |
1205 | continue; | |
1206 | ||
1207 | if (iph_check->version == 4) { | |
1208 | if (iph_check->saddr != iph->saddr || | |
1209 | iph_check->daddr != iph->daddr) | |
1210 | continue; | |
1211 | ||
043e397e | 1212 | seglen = iph_totlen(skb, iph_check) - |
8b713881 THJ |
1213 | (4 * iph_check->ihl); |
1214 | } else if (iph_check->version == 6) { | |
1215 | ipv6h = (struct ipv6hdr *)iph; | |
1216 | ipv6h_check = (struct ipv6hdr *)iph_check; | |
1217 | ||
1218 | if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) || | |
1219 | ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) | |
1220 | continue; | |
1221 | ||
1222 | seglen = ntohs(ipv6h_check->payload_len); | |
1223 | } else { | |
1224 | WARN_ON(1); /* shouldn't happen */ | |
1225 | continue; | |
1226 | } | |
1227 | ||
1228 | /* If the ECE/CWR flags changed from the previous eligible | |
1229 | * packet in the same flow, we should no longer be dropping that | |
1230 | * previous packet as this would lose information. | |
1231 | */ | |
1232 | if (elig_ack && (tcp_flag_word(tcph_check) & | |
1233 | (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { | |
1234 | elig_ack = NULL; | |
1235 | elig_ack_prev = NULL; | |
1236 | num_found--; | |
1237 | } | |
1238 | ||
1239 | /* Check TCP options and flags, don't drop ACKs with segment | |
1240 | * data, and don't drop ACKs with a higher cumulative ACK | |
1241 | * counter than the triggering packet. Check ACK seqno here to | |
1242 | * avoid parsing SACK options of packets we are going to exclude | |
1243 | * anyway. | |
1244 | */ | |
1245 | if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) || | |
1246 | (seglen - __tcp_hdrlen(tcph_check)) != 0 || | |
1247 | after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) | |
1248 | continue; | |
1249 | ||
1250 | /* Check SACK options. The triggering packet must SACK more data | |
1251 | * than the ACK under consideration, or SACK the same range but | |
1252 | * have a larger cumulative ACK counter. The latter is a | |
1253 | * pathological case, but is contained in the following check | |
1254 | * anyway, just to be safe. | |
1255 | */ | |
1256 | sack_comp = cake_tcph_sack_compare(tcph_check, tcph); | |
1257 | ||
1258 | if (sack_comp < 0 || | |
1259 | (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && | |
1260 | sack_comp == 0)) | |
1261 | continue; | |
1262 | ||
1263 | /* At this point we have found an eligible pure ACK to drop; if | |
1264 | * we are in aggressive mode, we are done. Otherwise, keep | |
1265 | * searching unless this is the second eligible ACK we | |
1266 | * found. | |
1267 | * | |
1268 | * Since we want to drop ACK closest to the head of the queue, | |
1269 | * save the first eligible ACK we find, even if we need to loop | |
1270 | * again. | |
1271 | */ | |
1272 | if (!elig_ack) { | |
1273 | elig_ack = skb_check; | |
1274 | elig_ack_prev = skb_prev; | |
1275 | elig_flags = (tcp_flag_word(tcph_check) | |
1276 | & (TCP_FLAG_ECE | TCP_FLAG_CWR)); | |
1277 | } | |
1278 | ||
1279 | if (num_found++ > 0) | |
1280 | goto found; | |
1281 | } | |
1282 | ||
1283 | /* We made it through the queue without finding two eligible ACKs . If | |
1284 | * we found a single eligible ACK we can drop it in aggressive mode if | |
1285 | * we can guarantee that this does not interfere with ECN flag | |
1286 | * information. We ensure this by dropping it only if the enqueued | |
1287 | * packet is consecutive with the eligible ACK, and their flags match. | |
1288 | */ | |
1289 | if (elig_ack && aggressive && elig_ack->next == skb && | |
1290 | (elig_flags == (tcp_flag_word(tcph) & | |
1291 | (TCP_FLAG_ECE | TCP_FLAG_CWR)))) | |
1292 | goto found; | |
1293 | ||
1294 | return NULL; | |
1295 | ||
1296 | found: | |
1297 | if (elig_ack_prev) | |
1298 | elig_ack_prev->next = elig_ack->next; | |
1299 | else | |
1300 | flow->head = elig_ack->next; | |
1301 | ||
a8305bff | 1302 | skb_mark_not_on_list(elig_ack); |
8b713881 THJ |
1303 | |
1304 | return elig_ack; | |
1305 | } | |
1306 | ||
046f6fd5 THJ |
1307 | static u64 cake_ewma(u64 avg, u64 sample, u32 shift) |
1308 | { | |
1309 | avg -= avg >> shift; | |
1310 | avg += sample >> shift; | |
1311 | return avg; | |
1312 | } | |
1313 | ||
a729b7f0 THJ |
1314 | static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) |
1315 | { | |
1316 | if (q->rate_flags & CAKE_FLAG_OVERHEAD) | |
1317 | len -= off; | |
1318 | ||
1319 | if (q->max_netlen < len) | |
1320 | q->max_netlen = len; | |
1321 | if (q->min_netlen > len) | |
1322 | q->min_netlen = len; | |
1323 | ||
1324 | len += q->rate_overhead; | |
1325 | ||
1326 | if (len < q->rate_mpu) | |
1327 | len = q->rate_mpu; | |
1328 | ||
1329 | if (q->atm_mode == CAKE_ATM_ATM) { | |
1330 | len += 47; | |
1331 | len /= 48; | |
1332 | len *= 53; | |
1333 | } else if (q->atm_mode == CAKE_ATM_PTM) { | |
1334 | /* Add one byte per 64 bytes or part thereof. | |
1335 | * This is conservative and easier to calculate than the | |
1336 | * precise value. | |
1337 | */ | |
1338 | len += (len + 63) / 64; | |
1339 | } | |
1340 | ||
1341 | if (q->max_adjlen < len) | |
1342 | q->max_adjlen = len; | |
1343 | if (q->min_adjlen > len) | |
1344 | q->min_adjlen = len; | |
1345 | ||
1346 | return len; | |
1347 | } | |
1348 | ||
1349 | static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) | |
1350 | { | |
1351 | const struct skb_shared_info *shinfo = skb_shinfo(skb); | |
1352 | unsigned int hdr_len, last_len = 0; | |
1353 | u32 off = skb_network_offset(skb); | |
1354 | u32 len = qdisc_pkt_len(skb); | |
1355 | u16 segs = 1; | |
1356 | ||
1357 | q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); | |
1358 | ||
1359 | if (!shinfo->gso_size) | |
1360 | return cake_calc_overhead(q, len, off); | |
1361 | ||
1362 | /* borrowed from qdisc_pkt_len_init() */ | |
1363 | hdr_len = skb_transport_header(skb) - skb_mac_header(skb); | |
1364 | ||
1365 | /* + transport layer */ | |
1366 | if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | | |
1367 | SKB_GSO_TCPV6))) { | |
1368 | const struct tcphdr *th; | |
1369 | struct tcphdr _tcphdr; | |
1370 | ||
1371 | th = skb_header_pointer(skb, skb_transport_offset(skb), | |
1372 | sizeof(_tcphdr), &_tcphdr); | |
1373 | if (likely(th)) | |
1374 | hdr_len += __tcp_hdrlen(th); | |
1375 | } else { | |
1376 | struct udphdr _udphdr; | |
1377 | ||
1378 | if (skb_header_pointer(skb, skb_transport_offset(skb), | |
1379 | sizeof(_udphdr), &_udphdr)) | |
1380 | hdr_len += sizeof(struct udphdr); | |
1381 | } | |
1382 | ||
1383 | if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) | |
1384 | segs = DIV_ROUND_UP(skb->len - hdr_len, | |
1385 | shinfo->gso_size); | |
1386 | else | |
1387 | segs = shinfo->gso_segs; | |
1388 | ||
1389 | len = shinfo->gso_size + hdr_len; | |
1390 | last_len = skb->len - shinfo->gso_size * (segs - 1); | |
1391 | ||
1392 | return (cake_calc_overhead(q, len, off) * (segs - 1) + | |
1393 | cake_calc_overhead(q, last_len, off)); | |
1394 | } | |
1395 | ||
046f6fd5 THJ |
1396 | static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) |
1397 | { | |
1398 | struct cake_heap_entry ii = q->overflow_heap[i]; | |
1399 | struct cake_heap_entry jj = q->overflow_heap[j]; | |
1400 | ||
1401 | q->overflow_heap[i] = jj; | |
1402 | q->overflow_heap[j] = ii; | |
1403 | ||
1404 | q->tins[ii.t].overflow_idx[ii.b] = j; | |
1405 | q->tins[jj.t].overflow_idx[jj.b] = i; | |
1406 | } | |
1407 | ||
1408 | static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) | |
1409 | { | |
1410 | struct cake_heap_entry ii = q->overflow_heap[i]; | |
1411 | ||
1412 | return q->tins[ii.t].backlogs[ii.b]; | |
1413 | } | |
1414 | ||
1415 | static void cake_heapify(struct cake_sched_data *q, u16 i) | |
1416 | { | |
1417 | static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; | |
1418 | u32 mb = cake_heap_get_backlog(q, i); | |
1419 | u32 m = i; | |
1420 | ||
1421 | while (m < a) { | |
1422 | u32 l = m + m + 1; | |
1423 | u32 r = l + 1; | |
1424 | ||
1425 | if (l < a) { | |
1426 | u32 lb = cake_heap_get_backlog(q, l); | |
1427 | ||
1428 | if (lb > mb) { | |
1429 | m = l; | |
1430 | mb = lb; | |
1431 | } | |
1432 | } | |
1433 | ||
1434 | if (r < a) { | |
1435 | u32 rb = cake_heap_get_backlog(q, r); | |
1436 | ||
1437 | if (rb > mb) { | |
1438 | m = r; | |
1439 | mb = rb; | |
1440 | } | |
1441 | } | |
1442 | ||
1443 | if (m != i) { | |
1444 | cake_heap_swap(q, i, m); | |
1445 | i = m; | |
1446 | } else { | |
1447 | break; | |
1448 | } | |
1449 | } | |
1450 | } | |
1451 | ||
1452 | static void cake_heapify_up(struct cake_sched_data *q, u16 i) | |
1453 | { | |
1454 | while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { | |
1455 | u16 p = (i - 1) >> 1; | |
1456 | u32 ib = cake_heap_get_backlog(q, i); | |
1457 | u32 pb = cake_heap_get_backlog(q, p); | |
1458 | ||
1459 | if (ib > pb) { | |
1460 | cake_heap_swap(q, i, p); | |
1461 | i = p; | |
1462 | } else { | |
1463 | break; | |
1464 | } | |
1465 | } | |
1466 | } | |
1467 | ||
1468 | static int cake_advance_shaper(struct cake_sched_data *q, | |
1469 | struct cake_tin_data *b, | |
1470 | struct sk_buff *skb, | |
1471 | ktime_t now, bool drop) | |
1472 | { | |
a729b7f0 | 1473 | u32 len = get_cobalt_cb(skb)->adjusted_len; |
046f6fd5 THJ |
1474 | |
1475 | /* charge packet bandwidth to this tin | |
1476 | * and to the global shaper. | |
1477 | */ | |
1478 | if (q->rate_ns) { | |
1479 | u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; | |
1480 | u64 global_dur = (len * q->rate_ns) >> q->rate_shft; | |
1481 | u64 failsafe_dur = global_dur + (global_dur >> 1); | |
1482 | ||
1483 | if (ktime_before(b->time_next_packet, now)) | |
1484 | b->time_next_packet = ktime_add_ns(b->time_next_packet, | |
1485 | tin_dur); | |
1486 | ||
1487 | else if (ktime_before(b->time_next_packet, | |
1488 | ktime_add_ns(now, tin_dur))) | |
1489 | b->time_next_packet = ktime_add_ns(now, tin_dur); | |
1490 | ||
1491 | q->time_next_packet = ktime_add_ns(q->time_next_packet, | |
1492 | global_dur); | |
1493 | if (!drop) | |
1494 | q->failsafe_next_packet = \ | |
1495 | ktime_add_ns(q->failsafe_next_packet, | |
1496 | failsafe_dur); | |
1497 | } | |
1498 | return len; | |
1499 | } | |
1500 | ||
1501 | static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) | |
1502 | { | |
1503 | struct cake_sched_data *q = qdisc_priv(sch); | |
1504 | ktime_t now = ktime_get(); | |
1505 | u32 idx = 0, tin = 0, len; | |
1506 | struct cake_heap_entry qq; | |
1507 | struct cake_tin_data *b; | |
1508 | struct cake_flow *flow; | |
1509 | struct sk_buff *skb; | |
1510 | ||
1511 | if (!q->overflow_timeout) { | |
1512 | int i; | |
1513 | /* Build fresh max-heap */ | |
1514 | for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) | |
1515 | cake_heapify(q, i); | |
1516 | } | |
1517 | q->overflow_timeout = 65535; | |
1518 | ||
1519 | /* select longest queue for pruning */ | |
1520 | qq = q->overflow_heap[0]; | |
1521 | tin = qq.t; | |
1522 | idx = qq.b; | |
1523 | ||
1524 | b = &q->tins[tin]; | |
1525 | flow = &b->flows[idx]; | |
1526 | skb = dequeue_head(flow); | |
1527 | if (unlikely(!skb)) { | |
1528 | /* heap has gone wrong, rebuild it next time */ | |
1529 | q->overflow_timeout = 0; | |
1530 | return idx + (tin << 16); | |
1531 | } | |
1532 | ||
1533 | if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) | |
1534 | b->unresponsive_flow_count++; | |
1535 | ||
1536 | len = qdisc_pkt_len(skb); | |
1537 | q->buffer_used -= skb->truesize; | |
1538 | b->backlogs[idx] -= len; | |
1539 | b->tin_backlog -= len; | |
1540 | sch->qstats.backlog -= len; | |
1541 | qdisc_tree_reduce_backlog(sch, 1, len); | |
1542 | ||
1543 | flow->dropped++; | |
1544 | b->tin_dropped++; | |
1545 | sch->qstats.drops++; | |
1546 | ||
7298de9c THJ |
1547 | if (q->rate_flags & CAKE_FLAG_INGRESS) |
1548 | cake_advance_shaper(q, b, skb, now, true); | |
1549 | ||
046f6fd5 THJ |
1550 | __qdisc_drop(skb, to_free); |
1551 | sch->q.qlen--; | |
1552 | ||
1553 | cake_heapify(q, 0); | |
1554 | ||
1555 | return idx + (tin << 16); | |
1556 | } | |
1557 | ||
8c95eca0 | 1558 | static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) |
83f8fd69 | 1559 | { |
9208d286 IP |
1560 | const int offset = skb_network_offset(skb); |
1561 | u16 *buf, buf_; | |
83f8fd69 THJ |
1562 | u8 dscp; |
1563 | ||
d7bf2ebe | 1564 | switch (skb_protocol(skb, true)) { |
83f8fd69 | 1565 | case htons(ETH_P_IP): |
9208d286 IP |
1566 | buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); |
1567 | if (unlikely(!buf)) | |
c87b4ecd THJ |
1568 | return 0; |
1569 | ||
9208d286 IP |
1570 | /* ToS is in the second byte of iphdr */ |
1571 | dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2; | |
1572 | ||
1573 | if (wash && dscp) { | |
1574 | const int wlen = offset + sizeof(struct iphdr); | |
1575 | ||
1576 | if (!pskb_may_pull(skb, wlen) || | |
1577 | skb_try_make_writable(skb, wlen)) | |
1578 | return 0; | |
1579 | ||
83f8fd69 | 1580 | ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); |
9208d286 IP |
1581 | } |
1582 | ||
83f8fd69 THJ |
1583 | return dscp; |
1584 | ||
1585 | case htons(ETH_P_IPV6): | |
9208d286 IP |
1586 | buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); |
1587 | if (unlikely(!buf)) | |
c87b4ecd THJ |
1588 | return 0; |
1589 | ||
9208d286 IP |
1590 | /* Traffic class is in the first and second bytes of ipv6hdr */ |
1591 | dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2; | |
1592 | ||
1593 | if (wash && dscp) { | |
1594 | const int wlen = offset + sizeof(struct ipv6hdr); | |
1595 | ||
1596 | if (!pskb_may_pull(skb, wlen) || | |
1597 | skb_try_make_writable(skb, wlen)) | |
1598 | return 0; | |
1599 | ||
83f8fd69 | 1600 | ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); |
9208d286 IP |
1601 | } |
1602 | ||
83f8fd69 THJ |
1603 | return dscp; |
1604 | ||
1605 | case htons(ETH_P_ARP): | |
1606 | return 0x38; /* CS7 - Net Control */ | |
1607 | ||
1608 | default: | |
1609 | /* If there is no Diffserv field, treat as best-effort */ | |
1610 | return 0; | |
1611 | } | |
1612 | } | |
1613 | ||
1614 | static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, | |
1615 | struct sk_buff *skb) | |
1616 | { | |
1617 | struct cake_sched_data *q = qdisc_priv(sch); | |
eab2fc82 | 1618 | u32 tin, mark; |
8c95eca0 | 1619 | bool wash; |
4976e3c6 | 1620 | u8 dscp; |
83f8fd69 | 1621 | |
4976e3c6 | 1622 | /* Tin selection: Default to diffserv-based selection, allow overriding |
8c95eca0 THJ |
1623 | * using firewall marks or skb->priority. Call DSCP parsing early if |
1624 | * wash is enabled, otherwise defer to below to skip unneeded parsing. | |
4976e3c6 | 1625 | */ |
eab2fc82 | 1626 | mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; |
8c95eca0 THJ |
1627 | wash = !!(q->rate_flags & CAKE_FLAG_WASH); |
1628 | if (wash) | |
1629 | dscp = cake_handle_diffserv(skb, wash); | |
83f8fd69 | 1630 | |
4976e3c6 | 1631 | if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) |
83f8fd69 | 1632 | tin = 0; |
4976e3c6 | 1633 | |
eab2fc82 THJ |
1634 | else if (mark && mark <= q->tin_cnt) |
1635 | tin = q->tin_order[mark - 1]; | |
4976e3c6 THJ |
1636 | |
1637 | else if (TC_H_MAJ(skb->priority) == sch->handle && | |
1638 | TC_H_MIN(skb->priority) > 0 && | |
1639 | TC_H_MIN(skb->priority) <= q->tin_cnt) | |
1640 | tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; | |
1641 | ||
1642 | else { | |
8c95eca0 THJ |
1643 | if (!wash) |
1644 | dscp = cake_handle_diffserv(skb, wash); | |
4976e3c6 THJ |
1645 | tin = q->tin_index[dscp]; |
1646 | ||
1647 | if (unlikely(tin >= q->tin_cnt)) | |
1648 | tin = 0; | |
83f8fd69 THJ |
1649 | } |
1650 | ||
1651 | return &q->tins[tin]; | |
1652 | } | |
1653 | ||
1654 | static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, | |
046f6fd5 THJ |
1655 | struct sk_buff *skb, int flow_mode, int *qerr) |
1656 | { | |
1657 | struct cake_sched_data *q = qdisc_priv(sch); | |
1658 | struct tcf_proto *filter; | |
1659 | struct tcf_result res; | |
93cfb6c1 | 1660 | u16 flow = 0, host = 0; |
046f6fd5 THJ |
1661 | int result; |
1662 | ||
1663 | filter = rcu_dereference_bh(q->filter_list); | |
1664 | if (!filter) | |
83f8fd69 | 1665 | goto hash; |
046f6fd5 THJ |
1666 | |
1667 | *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; | |
3aa26055 | 1668 | result = tcf_classify(skb, NULL, filter, &res, false); |
83f8fd69 | 1669 | |
046f6fd5 THJ |
1670 | if (result >= 0) { |
1671 | #ifdef CONFIG_NET_CLS_ACT | |
1672 | switch (result) { | |
1673 | case TC_ACT_STOLEN: | |
1674 | case TC_ACT_QUEUED: | |
1675 | case TC_ACT_TRAP: | |
1676 | *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; | |
df561f66 | 1677 | fallthrough; |
046f6fd5 THJ |
1678 | case TC_ACT_SHOT: |
1679 | return 0; | |
1680 | } | |
1681 | #endif | |
1682 | if (TC_H_MIN(res.classid) <= CAKE_QUEUES) | |
83f8fd69 | 1683 | flow = TC_H_MIN(res.classid); |
93cfb6c1 THJ |
1684 | if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16)) |
1685 | host = TC_H_MAJ(res.classid) >> 16; | |
046f6fd5 | 1686 | } |
83f8fd69 THJ |
1687 | hash: |
1688 | *t = cake_select_tin(sch, skb); | |
93cfb6c1 | 1689 | return cake_hash(*t, skb, flow_mode, flow, host) + 1; |
046f6fd5 THJ |
1690 | } |
1691 | ||
7298de9c THJ |
1692 | static void cake_reconfigure(struct Qdisc *sch); |
1693 | ||
046f6fd5 THJ |
1694 | static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, |
1695 | struct sk_buff **to_free) | |
1696 | { | |
1697 | struct cake_sched_data *q = qdisc_priv(sch); | |
1698 | int len = qdisc_pkt_len(skb); | |
3f649ab7 | 1699 | int ret; |
8b713881 | 1700 | struct sk_buff *ack = NULL; |
046f6fd5 THJ |
1701 | ktime_t now = ktime_get(); |
1702 | struct cake_tin_data *b; | |
1703 | struct cake_flow *flow; | |
83f8fd69 | 1704 | u32 idx; |
046f6fd5 THJ |
1705 | |
1706 | /* choose flow to insert into */ | |
83f8fd69 | 1707 | idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); |
046f6fd5 THJ |
1708 | if (idx == 0) { |
1709 | if (ret & __NET_XMIT_BYPASS) | |
1710 | qdisc_qstats_drop(sch); | |
1711 | __qdisc_drop(skb, to_free); | |
1712 | return ret; | |
1713 | } | |
1714 | idx--; | |
1715 | flow = &b->flows[idx]; | |
1716 | ||
1717 | /* ensure shaper state isn't stale */ | |
1718 | if (!b->tin_backlog) { | |
1719 | if (ktime_before(b->time_next_packet, now)) | |
1720 | b->time_next_packet = now; | |
1721 | ||
1722 | if (!sch->q.qlen) { | |
1723 | if (ktime_before(q->time_next_packet, now)) { | |
1724 | q->failsafe_next_packet = now; | |
1725 | q->time_next_packet = now; | |
1726 | } else if (ktime_after(q->time_next_packet, now) && | |
1727 | ktime_after(q->failsafe_next_packet, now)) { | |
1728 | u64 next = \ | |
1729 | min(ktime_to_ns(q->time_next_packet), | |
1730 | ktime_to_ns( | |
1731 | q->failsafe_next_packet)); | |
1732 | sch->qstats.overlimits++; | |
1733 | qdisc_watchdog_schedule_ns(&q->watchdog, next); | |
1734 | } | |
1735 | } | |
1736 | } | |
1737 | ||
1738 | if (unlikely(len > b->max_skblen)) | |
1739 | b->max_skblen = len; | |
1740 | ||
0c850344 THJ |
1741 | if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { |
1742 | struct sk_buff *segs, *nskb; | |
1743 | netdev_features_t features = netif_skb_features(skb); | |
8c6c37fd | 1744 | unsigned int slen = 0, numsegs = 0; |
0c850344 THJ |
1745 | |
1746 | segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); | |
1747 | if (IS_ERR_OR_NULL(segs)) | |
1748 | return qdisc_drop(skb, sch, to_free); | |
1749 | ||
b950d8a5 | 1750 | skb_list_walk_safe(segs, segs, nskb) { |
a8305bff | 1751 | skb_mark_not_on_list(segs); |
0c850344 THJ |
1752 | qdisc_skb_cb(segs)->pkt_len = segs->len; |
1753 | cobalt_set_enqueue_time(segs, now); | |
1754 | get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, | |
1755 | segs); | |
1756 | flow_queue_add(flow, segs); | |
1757 | ||
1758 | sch->q.qlen++; | |
8c6c37fd | 1759 | numsegs++; |
0c850344 THJ |
1760 | slen += segs->len; |
1761 | q->buffer_used += segs->truesize; | |
1762 | b->packets++; | |
0c850344 | 1763 | } |
8b713881 | 1764 | |
0c850344 THJ |
1765 | /* stats */ |
1766 | b->bytes += slen; | |
1767 | b->backlogs[idx] += slen; | |
1768 | b->tin_backlog += slen; | |
1769 | sch->qstats.backlog += slen; | |
1770 | q->avg_window_bytes += slen; | |
8b713881 | 1771 | |
8c6c37fd | 1772 | qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); |
0c850344 | 1773 | consume_skb(skb); |
8b713881 | 1774 | } else { |
0c850344 THJ |
1775 | /* not splitting */ |
1776 | cobalt_set_enqueue_time(skb, now); | |
1777 | get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); | |
1778 | flow_queue_add(flow, skb); | |
1779 | ||
1780 | if (q->ack_filter) | |
1781 | ack = cake_ack_filter(q, flow); | |
1782 | ||
1783 | if (ack) { | |
1784 | b->ack_drops++; | |
1785 | sch->qstats.drops++; | |
1786 | b->bytes += qdisc_pkt_len(ack); | |
1787 | len -= qdisc_pkt_len(ack); | |
1788 | q->buffer_used += skb->truesize - ack->truesize; | |
1789 | if (q->rate_flags & CAKE_FLAG_INGRESS) | |
1790 | cake_advance_shaper(q, b, ack, now, true); | |
1791 | ||
1792 | qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); | |
1793 | consume_skb(ack); | |
1794 | } else { | |
1795 | sch->q.qlen++; | |
1796 | q->buffer_used += skb->truesize; | |
1797 | } | |
046f6fd5 | 1798 | |
0c850344 THJ |
1799 | /* stats */ |
1800 | b->packets++; | |
1801 | b->bytes += len; | |
1802 | b->backlogs[idx] += len; | |
1803 | b->tin_backlog += len; | |
1804 | sch->qstats.backlog += len; | |
1805 | q->avg_window_bytes += len; | |
1806 | } | |
046f6fd5 THJ |
1807 | |
1808 | if (q->overflow_timeout) | |
1809 | cake_heapify_up(q, b->overflow_idx[idx]); | |
1810 | ||
1811 | /* incoming bandwidth capacity estimate */ | |
7298de9c THJ |
1812 | if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { |
1813 | u64 packet_interval = \ | |
1814 | ktime_to_ns(ktime_sub(now, q->last_packet_time)); | |
1815 | ||
1816 | if (packet_interval > NSEC_PER_SEC) | |
1817 | packet_interval = NSEC_PER_SEC; | |
1818 | ||
1819 | /* filter out short-term bursts, eg. wifi aggregation */ | |
1820 | q->avg_packet_interval = \ | |
1821 | cake_ewma(q->avg_packet_interval, | |
1822 | packet_interval, | |
1823 | (packet_interval > q->avg_packet_interval ? | |
1824 | 2 : 8)); | |
1825 | ||
1826 | q->last_packet_time = now; | |
1827 | ||
1828 | if (packet_interval > q->avg_packet_interval) { | |
1829 | u64 window_interval = \ | |
1830 | ktime_to_ns(ktime_sub(now, | |
1831 | q->avg_window_begin)); | |
1832 | u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; | |
1833 | ||
68aab823 | 1834 | b = div64_u64(b, window_interval); |
7298de9c THJ |
1835 | q->avg_peak_bandwidth = |
1836 | cake_ewma(q->avg_peak_bandwidth, b, | |
1837 | b > q->avg_peak_bandwidth ? 2 : 8); | |
1838 | q->avg_window_bytes = 0; | |
1839 | q->avg_window_begin = now; | |
1840 | ||
1841 | if (ktime_after(now, | |
1842 | ktime_add_ms(q->last_reconfig_time, | |
1843 | 250))) { | |
1844 | q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; | |
1845 | cake_reconfigure(sch); | |
1846 | } | |
1847 | } | |
1848 | } else { | |
1849 | q->avg_window_bytes = 0; | |
1850 | q->last_packet_time = now; | |
1851 | } | |
046f6fd5 THJ |
1852 | |
1853 | /* flowchain */ | |
1854 | if (!flow->set || flow->set == CAKE_SET_DECAYING) { | |
1855 | struct cake_host *srchost = &b->hosts[flow->srchost]; | |
1856 | struct cake_host *dsthost = &b->hosts[flow->dsthost]; | |
1857 | u16 host_load = 1; | |
1858 | ||
1859 | if (!flow->set) { | |
1860 | list_add_tail(&flow->flowchain, &b->new_flows); | |
1861 | } else { | |
1862 | b->decaying_flow_count--; | |
1863 | list_move_tail(&flow->flowchain, &b->new_flows); | |
1864 | } | |
1865 | flow->set = CAKE_SET_SPARSE; | |
1866 | b->sparse_flow_count++; | |
1867 | ||
1868 | if (cake_dsrc(q->flow_mode)) | |
71263992 | 1869 | host_load = max(host_load, srchost->srchost_bulk_flow_count); |
046f6fd5 THJ |
1870 | |
1871 | if (cake_ddst(q->flow_mode)) | |
71263992 | 1872 | host_load = max(host_load, dsthost->dsthost_bulk_flow_count); |
046f6fd5 THJ |
1873 | |
1874 | flow->deficit = (b->flow_quantum * | |
1875 | quantum_div[host_load]) >> 16; | |
1876 | } else if (flow->set == CAKE_SET_SPARSE_WAIT) { | |
71263992 GA |
1877 | struct cake_host *srchost = &b->hosts[flow->srchost]; |
1878 | struct cake_host *dsthost = &b->hosts[flow->dsthost]; | |
1879 | ||
046f6fd5 THJ |
1880 | /* this flow was empty, accounted as a sparse flow, but actually |
1881 | * in the bulk rotation. | |
1882 | */ | |
1883 | flow->set = CAKE_SET_BULK; | |
1884 | b->sparse_flow_count--; | |
1885 | b->bulk_flow_count++; | |
71263992 GA |
1886 | |
1887 | if (cake_dsrc(q->flow_mode)) | |
1888 | srchost->srchost_bulk_flow_count++; | |
1889 | ||
1890 | if (cake_ddst(q->flow_mode)) | |
1891 | dsthost->dsthost_bulk_flow_count++; | |
1892 | ||
046f6fd5 THJ |
1893 | } |
1894 | ||
1895 | if (q->buffer_used > q->buffer_max_used) | |
1896 | q->buffer_max_used = q->buffer_used; | |
1897 | ||
1898 | if (q->buffer_used > q->buffer_limit) { | |
1899 | u32 dropped = 0; | |
1900 | ||
1901 | while (q->buffer_used > q->buffer_limit) { | |
1902 | dropped++; | |
1903 | cake_drop(sch, to_free); | |
1904 | } | |
1905 | b->drop_overlimit += dropped; | |
1906 | } | |
1907 | return NET_XMIT_SUCCESS; | |
1908 | } | |
1909 | ||
1910 | static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) | |
1911 | { | |
1912 | struct cake_sched_data *q = qdisc_priv(sch); | |
1913 | struct cake_tin_data *b = &q->tins[q->cur_tin]; | |
1914 | struct cake_flow *flow = &b->flows[q->cur_flow]; | |
1915 | struct sk_buff *skb = NULL; | |
1916 | u32 len; | |
1917 | ||
1918 | if (flow->head) { | |
1919 | skb = dequeue_head(flow); | |
1920 | len = qdisc_pkt_len(skb); | |
1921 | b->backlogs[q->cur_flow] -= len; | |
1922 | b->tin_backlog -= len; | |
1923 | sch->qstats.backlog -= len; | |
1924 | q->buffer_used -= skb->truesize; | |
1925 | sch->q.qlen--; | |
1926 | ||
1927 | if (q->overflow_timeout) | |
1928 | cake_heapify(q, b->overflow_idx[q->cur_flow]); | |
1929 | } | |
1930 | return skb; | |
1931 | } | |
1932 | ||
1933 | /* Discard leftover packets from a tin no longer in use. */ | |
1934 | static void cake_clear_tin(struct Qdisc *sch, u16 tin) | |
1935 | { | |
1936 | struct cake_sched_data *q = qdisc_priv(sch); | |
1937 | struct sk_buff *skb; | |
1938 | ||
1939 | q->cur_tin = tin; | |
1940 | for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) | |
1941 | while (!!(skb = cake_dequeue_one(sch))) | |
1942 | kfree_skb(skb); | |
1943 | } | |
1944 | ||
1945 | static struct sk_buff *cake_dequeue(struct Qdisc *sch) | |
1946 | { | |
1947 | struct cake_sched_data *q = qdisc_priv(sch); | |
1948 | struct cake_tin_data *b = &q->tins[q->cur_tin]; | |
1949 | struct cake_host *srchost, *dsthost; | |
1950 | ktime_t now = ktime_get(); | |
1951 | struct cake_flow *flow; | |
1952 | struct list_head *head; | |
1953 | bool first_flow = true; | |
1954 | struct sk_buff *skb; | |
1955 | u16 host_load; | |
1956 | u64 delay; | |
1957 | u32 len; | |
1958 | ||
1959 | begin: | |
1960 | if (!sch->q.qlen) | |
1961 | return NULL; | |
1962 | ||
1963 | /* global hard shaper */ | |
1964 | if (ktime_after(q->time_next_packet, now) && | |
1965 | ktime_after(q->failsafe_next_packet, now)) { | |
1966 | u64 next = min(ktime_to_ns(q->time_next_packet), | |
1967 | ktime_to_ns(q->failsafe_next_packet)); | |
1968 | ||
1969 | sch->qstats.overlimits++; | |
1970 | qdisc_watchdog_schedule_ns(&q->watchdog, next); | |
1971 | return NULL; | |
1972 | } | |
1973 | ||
1974 | /* Choose a class to work on. */ | |
1975 | if (!q->rate_ns) { | |
1976 | /* In unlimited mode, can't rely on shaper timings, just balance | |
1977 | * with DRR | |
1978 | */ | |
1979 | bool wrapped = false, empty = true; | |
1980 | ||
1981 | while (b->tin_deficit < 0 || | |
1982 | !(b->sparse_flow_count + b->bulk_flow_count)) { | |
1983 | if (b->tin_deficit <= 0) | |
cbd22f17 | 1984 | b->tin_deficit += b->tin_quantum; |
046f6fd5 THJ |
1985 | if (b->sparse_flow_count + b->bulk_flow_count) |
1986 | empty = false; | |
1987 | ||
1988 | q->cur_tin++; | |
1989 | b++; | |
1990 | if (q->cur_tin >= q->tin_cnt) { | |
1991 | q->cur_tin = 0; | |
1992 | b = q->tins; | |
1993 | ||
1994 | if (wrapped) { | |
1995 | /* It's possible for q->qlen to be | |
1996 | * nonzero when we actually have no | |
1997 | * packets anywhere. | |
1998 | */ | |
1999 | if (empty) | |
2000 | return NULL; | |
2001 | } else { | |
2002 | wrapped = true; | |
2003 | } | |
2004 | } | |
2005 | } | |
2006 | } else { | |
2007 | /* In shaped mode, choose: | |
2008 | * - Highest-priority tin with queue and meeting schedule, or | |
2009 | * - The earliest-scheduled tin with queue. | |
2010 | */ | |
2011 | ktime_t best_time = KTIME_MAX; | |
2012 | int tin, best_tin = 0; | |
2013 | ||
2014 | for (tin = 0; tin < q->tin_cnt; tin++) { | |
2015 | b = q->tins + tin; | |
2016 | if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { | |
2017 | ktime_t time_to_pkt = \ | |
2018 | ktime_sub(b->time_next_packet, now); | |
2019 | ||
2020 | if (ktime_to_ns(time_to_pkt) <= 0 || | |
2021 | ktime_compare(time_to_pkt, | |
2022 | best_time) <= 0) { | |
2023 | best_time = time_to_pkt; | |
2024 | best_tin = tin; | |
2025 | } | |
2026 | } | |
2027 | } | |
2028 | ||
2029 | q->cur_tin = best_tin; | |
2030 | b = q->tins + best_tin; | |
2031 | ||
2032 | /* No point in going further if no packets to deliver. */ | |
2033 | if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) | |
2034 | return NULL; | |
2035 | } | |
2036 | ||
2037 | retry: | |
2038 | /* service this class */ | |
2039 | head = &b->decaying_flows; | |
2040 | if (!first_flow || list_empty(head)) { | |
2041 | head = &b->new_flows; | |
2042 | if (list_empty(head)) { | |
2043 | head = &b->old_flows; | |
2044 | if (unlikely(list_empty(head))) { | |
2045 | head = &b->decaying_flows; | |
2046 | if (unlikely(list_empty(head))) | |
2047 | goto begin; | |
2048 | } | |
2049 | } | |
2050 | } | |
2051 | flow = list_first_entry(head, struct cake_flow, flowchain); | |
2052 | q->cur_flow = flow - b->flows; | |
2053 | first_flow = false; | |
2054 | ||
2055 | /* triple isolation (modified DRR++) */ | |
2056 | srchost = &b->hosts[flow->srchost]; | |
2057 | dsthost = &b->hosts[flow->dsthost]; | |
2058 | host_load = 1; | |
2059 | ||
046f6fd5 THJ |
2060 | /* flow isolation (DRR++) */ |
2061 | if (flow->deficit <= 0) { | |
046f6fd5 THJ |
2062 | /* Keep all flows with deficits out of the sparse and decaying |
2063 | * rotations. No non-empty flow can go into the decaying | |
2064 | * rotation, so they can't get deficits | |
2065 | */ | |
2066 | if (flow->set == CAKE_SET_SPARSE) { | |
2067 | if (flow->head) { | |
2068 | b->sparse_flow_count--; | |
2069 | b->bulk_flow_count++; | |
71263992 GA |
2070 | |
2071 | if (cake_dsrc(q->flow_mode)) | |
2072 | srchost->srchost_bulk_flow_count++; | |
2073 | ||
2074 | if (cake_ddst(q->flow_mode)) | |
2075 | dsthost->dsthost_bulk_flow_count++; | |
2076 | ||
046f6fd5 THJ |
2077 | flow->set = CAKE_SET_BULK; |
2078 | } else { | |
2079 | /* we've moved it to the bulk rotation for | |
2080 | * correct deficit accounting but we still want | |
2081 | * to count it as a sparse flow, not a bulk one. | |
2082 | */ | |
2083 | flow->set = CAKE_SET_SPARSE_WAIT; | |
2084 | } | |
2085 | } | |
71263992 GA |
2086 | |
2087 | if (cake_dsrc(q->flow_mode)) | |
2088 | host_load = max(host_load, srchost->srchost_bulk_flow_count); | |
2089 | ||
2090 | if (cake_ddst(q->flow_mode)) | |
2091 | host_load = max(host_load, dsthost->dsthost_bulk_flow_count); | |
2092 | ||
2093 | WARN_ON(host_load > CAKE_QUEUES); | |
2094 | ||
7e3cf084 JD |
2095 | /* The get_random_u16() is a way to apply dithering to avoid |
2096 | * accumulating roundoff errors | |
71263992 GA |
2097 | */ |
2098 | flow->deficit += (b->flow_quantum * quantum_div[host_load] + | |
7e3cf084 | 2099 | get_random_u16()) >> 16; |
71263992 GA |
2100 | list_move_tail(&flow->flowchain, &b->old_flows); |
2101 | ||
046f6fd5 THJ |
2102 | goto retry; |
2103 | } | |
2104 | ||
2105 | /* Retrieve a packet via the AQM */ | |
2106 | while (1) { | |
2107 | skb = cake_dequeue_one(sch); | |
2108 | if (!skb) { | |
2109 | /* this queue was actually empty */ | |
2110 | if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) | |
2111 | b->unresponsive_flow_count--; | |
2112 | ||
2113 | if (flow->cvars.p_drop || flow->cvars.count || | |
2114 | ktime_before(now, flow->cvars.drop_next)) { | |
2115 | /* keep in the flowchain until the state has | |
2116 | * decayed to rest | |
2117 | */ | |
2118 | list_move_tail(&flow->flowchain, | |
2119 | &b->decaying_flows); | |
2120 | if (flow->set == CAKE_SET_BULK) { | |
2121 | b->bulk_flow_count--; | |
71263992 GA |
2122 | |
2123 | if (cake_dsrc(q->flow_mode)) | |
2124 | srchost->srchost_bulk_flow_count--; | |
2125 | ||
2126 | if (cake_ddst(q->flow_mode)) | |
2127 | dsthost->dsthost_bulk_flow_count--; | |
2128 | ||
046f6fd5 THJ |
2129 | b->decaying_flow_count++; |
2130 | } else if (flow->set == CAKE_SET_SPARSE || | |
2131 | flow->set == CAKE_SET_SPARSE_WAIT) { | |
2132 | b->sparse_flow_count--; | |
2133 | b->decaying_flow_count++; | |
2134 | } | |
2135 | flow->set = CAKE_SET_DECAYING; | |
2136 | } else { | |
2137 | /* remove empty queue from the flowchain */ | |
2138 | list_del_init(&flow->flowchain); | |
2139 | if (flow->set == CAKE_SET_SPARSE || | |
2140 | flow->set == CAKE_SET_SPARSE_WAIT) | |
2141 | b->sparse_flow_count--; | |
71263992 | 2142 | else if (flow->set == CAKE_SET_BULK) { |
046f6fd5 | 2143 | b->bulk_flow_count--; |
71263992 GA |
2144 | |
2145 | if (cake_dsrc(q->flow_mode)) | |
2146 | srchost->srchost_bulk_flow_count--; | |
2147 | ||
2148 | if (cake_ddst(q->flow_mode)) | |
2149 | dsthost->dsthost_bulk_flow_count--; | |
2150 | ||
2151 | } else | |
046f6fd5 THJ |
2152 | b->decaying_flow_count--; |
2153 | ||
2154 | flow->set = CAKE_SET_NONE; | |
046f6fd5 THJ |
2155 | } |
2156 | goto begin; | |
2157 | } | |
2158 | ||
2159 | /* Last packet in queue may be marked, shouldn't be dropped */ | |
7298de9c THJ |
2160 | if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, |
2161 | (b->bulk_flow_count * | |
2162 | !!(q->rate_flags & | |
2163 | CAKE_FLAG_INGRESS))) || | |
046f6fd5 THJ |
2164 | !flow->head) |
2165 | break; | |
2166 | ||
7298de9c THJ |
2167 | /* drop this packet, get another one */ |
2168 | if (q->rate_flags & CAKE_FLAG_INGRESS) { | |
2169 | len = cake_advance_shaper(q, b, skb, | |
2170 | now, true); | |
2171 | flow->deficit -= len; | |
2172 | b->tin_deficit -= len; | |
2173 | } | |
046f6fd5 THJ |
2174 | flow->dropped++; |
2175 | b->tin_dropped++; | |
2176 | qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); | |
2177 | qdisc_qstats_drop(sch); | |
2178 | kfree_skb(skb); | |
7298de9c THJ |
2179 | if (q->rate_flags & CAKE_FLAG_INGRESS) |
2180 | goto retry; | |
046f6fd5 THJ |
2181 | } |
2182 | ||
2183 | b->tin_ecn_mark += !!flow->cvars.ecn_marked; | |
2184 | qdisc_bstats_update(sch, skb); | |
2185 | ||
2186 | /* collect delay stats */ | |
2187 | delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); | |
2188 | b->avge_delay = cake_ewma(b->avge_delay, delay, 8); | |
2189 | b->peak_delay = cake_ewma(b->peak_delay, delay, | |
2190 | delay > b->peak_delay ? 2 : 8); | |
2191 | b->base_delay = cake_ewma(b->base_delay, delay, | |
2192 | delay < b->base_delay ? 2 : 8); | |
2193 | ||
2194 | len = cake_advance_shaper(q, b, skb, now, false); | |
2195 | flow->deficit -= len; | |
2196 | b->tin_deficit -= len; | |
2197 | ||
2198 | if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { | |
2199 | u64 next = min(ktime_to_ns(q->time_next_packet), | |
2200 | ktime_to_ns(q->failsafe_next_packet)); | |
2201 | ||
2202 | qdisc_watchdog_schedule_ns(&q->watchdog, next); | |
2203 | } else if (!sch->q.qlen) { | |
2204 | int i; | |
2205 | ||
2206 | for (i = 0; i < q->tin_cnt; i++) { | |
2207 | if (q->tins[i].decaying_flow_count) { | |
2208 | ktime_t next = \ | |
2209 | ktime_add_ns(now, | |
2210 | q->tins[i].cparams.target); | |
2211 | ||
2212 | qdisc_watchdog_schedule_ns(&q->watchdog, | |
2213 | ktime_to_ns(next)); | |
2214 | break; | |
2215 | } | |
2216 | } | |
2217 | } | |
2218 | ||
2219 | if (q->overflow_timeout) | |
2220 | q->overflow_timeout--; | |
2221 | ||
2222 | return skb; | |
2223 | } | |
2224 | ||
2225 | static void cake_reset(struct Qdisc *sch) | |
2226 | { | |
51f9a892 | 2227 | struct cake_sched_data *q = qdisc_priv(sch); |
046f6fd5 THJ |
2228 | u32 c; |
2229 | ||
51f9a892 ZS |
2230 | if (!q->tins) |
2231 | return; | |
2232 | ||
046f6fd5 THJ |
2233 | for (c = 0; c < CAKE_MAX_TINS; c++) |
2234 | cake_clear_tin(sch, c); | |
2235 | } | |
2236 | ||
2237 | static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { | |
2238 | [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, | |
2239 | [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, | |
2240 | [TCA_CAKE_ATM] = { .type = NLA_U32 }, | |
2241 | [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, | |
2242 | [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, | |
2243 | [TCA_CAKE_RTT] = { .type = NLA_U32 }, | |
2244 | [TCA_CAKE_TARGET] = { .type = NLA_U32 }, | |
2245 | [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, | |
2246 | [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, | |
2247 | [TCA_CAKE_NAT] = { .type = NLA_U32 }, | |
2248 | [TCA_CAKE_RAW] = { .type = NLA_U32 }, | |
2249 | [TCA_CAKE_WASH] = { .type = NLA_U32 }, | |
2250 | [TCA_CAKE_MPU] = { .type = NLA_U32 }, | |
2251 | [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, | |
2252 | [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, | |
b3c424eb | 2253 | [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, |
eab2fc82 | 2254 | [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, |
046f6fd5 THJ |
2255 | }; |
2256 | ||
2257 | static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, | |
2258 | u64 target_ns, u64 rtt_est_ns) | |
2259 | { | |
2260 | /* convert byte-rate into time-per-byte | |
2261 | * so it will always unwedge in reasonable time. | |
2262 | */ | |
2263 | static const u64 MIN_RATE = 64; | |
2264 | u32 byte_target = mtu; | |
2265 | u64 byte_target_ns; | |
2266 | u8 rate_shft = 0; | |
2267 | u64 rate_ns = 0; | |
2268 | ||
2269 | b->flow_quantum = 1514; | |
2270 | if (rate) { | |
2271 | b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); | |
2272 | rate_shft = 34; | |
2273 | rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; | |
2274 | rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); | |
2275 | while (!!(rate_ns >> 34)) { | |
2276 | rate_ns >>= 1; | |
2277 | rate_shft--; | |
2278 | } | |
2279 | } /* else unlimited, ie. zero delay */ | |
2280 | ||
2281 | b->tin_rate_bps = rate; | |
2282 | b->tin_rate_ns = rate_ns; | |
2283 | b->tin_rate_shft = rate_shft; | |
2284 | ||
2285 | byte_target_ns = (byte_target * rate_ns) >> rate_shft; | |
2286 | ||
2287 | b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); | |
2288 | b->cparams.interval = max(rtt_est_ns + | |
2289 | b->cparams.target - target_ns, | |
2290 | b->cparams.target * 2); | |
2291 | b->cparams.mtu_time = byte_target_ns; | |
2292 | b->cparams.p_inc = 1 << 24; /* 1/256 */ | |
2293 | b->cparams.p_dec = 1 << 20; /* 1/4096 */ | |
2294 | } | |
2295 | ||
83f8fd69 | 2296 | static int cake_config_besteffort(struct Qdisc *sch) |
046f6fd5 THJ |
2297 | { |
2298 | struct cake_sched_data *q = qdisc_priv(sch); | |
2299 | struct cake_tin_data *b = &q->tins[0]; | |
83f8fd69 THJ |
2300 | u32 mtu = psched_mtu(qdisc_dev(sch)); |
2301 | u64 rate = q->rate_bps; | |
046f6fd5 THJ |
2302 | |
2303 | q->tin_cnt = 1; | |
83f8fd69 THJ |
2304 | |
2305 | q->tin_index = besteffort; | |
2306 | q->tin_order = normal_order; | |
2307 | ||
2308 | cake_set_rate(b, rate, mtu, | |
046f6fd5 | 2309 | us_to_ns(q->target), us_to_ns(q->interval)); |
cbd22f17 | 2310 | b->tin_quantum = 65535; |
046f6fd5 | 2311 | |
83f8fd69 THJ |
2312 | return 0; |
2313 | } | |
2314 | ||
2315 | static int cake_config_precedence(struct Qdisc *sch) | |
2316 | { | |
2317 | /* convert high-level (user visible) parameters into internal format */ | |
2318 | struct cake_sched_data *q = qdisc_priv(sch); | |
2319 | u32 mtu = psched_mtu(qdisc_dev(sch)); | |
2320 | u64 rate = q->rate_bps; | |
cbd22f17 | 2321 | u32 quantum = 256; |
83f8fd69 THJ |
2322 | u32 i; |
2323 | ||
2324 | q->tin_cnt = 8; | |
2325 | q->tin_index = precedence; | |
2326 | q->tin_order = normal_order; | |
2327 | ||
2328 | for (i = 0; i < q->tin_cnt; i++) { | |
2329 | struct cake_tin_data *b = &q->tins[i]; | |
2330 | ||
2331 | cake_set_rate(b, rate, mtu, us_to_ns(q->target), | |
2332 | us_to_ns(q->interval)); | |
2333 | ||
cbd22f17 | 2334 | b->tin_quantum = max_t(u16, 1U, quantum); |
83f8fd69 THJ |
2335 | |
2336 | /* calculate next class's parameters */ | |
2337 | rate *= 7; | |
2338 | rate >>= 3; | |
2339 | ||
cbd22f17 KDB |
2340 | quantum *= 7; |
2341 | quantum >>= 3; | |
83f8fd69 THJ |
2342 | } |
2343 | ||
2344 | return 0; | |
2345 | } | |
2346 | ||
2347 | /* List of known Diffserv codepoints: | |
2348 | * | |
c25af830 | 2349 | * Default Forwarding (DF/CS0) - Best Effort |
83f8fd69 THJ |
2350 | * Max Throughput (TOS2) |
2351 | * Min Delay (TOS4) | |
2352 | * LLT "La" (TOS5) | |
2353 | * Assured Forwarding 1 (AF1x) - x3 | |
2354 | * Assured Forwarding 2 (AF2x) - x3 | |
2355 | * Assured Forwarding 3 (AF3x) - x3 | |
2356 | * Assured Forwarding 4 (AF4x) - x3 | |
c25af830 | 2357 | * Precedence Class 1 (CS1) |
83f8fd69 THJ |
2358 | * Precedence Class 2 (CS2) |
2359 | * Precedence Class 3 (CS3) | |
2360 | * Precedence Class 4 (CS4) | |
2361 | * Precedence Class 5 (CS5) | |
2362 | * Precedence Class 6 (CS6) | |
2363 | * Precedence Class 7 (CS7) | |
2364 | * Voice Admit (VA) | |
2365 | * Expedited Forwarding (EF) | |
c25af830 KB |
2366 | * Lower Effort (LE) |
2367 | * | |
2368 | * Total 26 codepoints. | |
83f8fd69 THJ |
2369 | */ |
2370 | ||
4f667b8e | 2371 | /* List of traffic classes in RFC 4594, updated by RFC 8622: |
83f8fd69 THJ |
2372 | * (roughly descending order of contended priority) |
2373 | * (roughly ascending order of uncontended throughput) | |
2374 | * | |
2375 | * Network Control (CS6,CS7) - routing traffic | |
2376 | * Telephony (EF,VA) - aka. VoIP streams | |
2377 | * Signalling (CS5) - VoIP setup | |
2378 | * Multimedia Conferencing (AF4x) - aka. video calls | |
2379 | * Realtime Interactive (CS4) - eg. games | |
2380 | * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch | |
2381 | * Broadcast Video (CS3) | |
c25af830 KB |
2382 | * Low-Latency Data (AF2x,TOS4) - eg. database |
2383 | * Ops, Admin, Management (CS2) - eg. ssh | |
2384 | * Standard Service (DF & unrecognised codepoints) | |
2385 | * High-Throughput Data (AF1x,TOS2) - eg. web traffic | |
2386 | * Low-Priority Data (LE,CS1) - eg. BitTorrent | |
2387 | * | |
83f8fd69 THJ |
2388 | * Total 12 traffic classes. |
2389 | */ | |
2390 | ||
2391 | static int cake_config_diffserv8(struct Qdisc *sch) | |
2392 | { | |
2393 | /* Pruned list of traffic classes for typical applications: | |
2394 | * | |
2395 | * Network Control (CS6, CS7) | |
2396 | * Minimum Latency (EF, VA, CS5, CS4) | |
c25af830 | 2397 | * Interactive Shell (CS2) |
83f8fd69 THJ |
2398 | * Low Latency Transactions (AF2x, TOS4) |
2399 | * Video Streaming (AF4x, AF3x, CS3) | |
c25af830 KB |
2400 | * Bog Standard (DF etc.) |
2401 | * High Throughput (AF1x, TOS2, CS1) | |
2402 | * Background Traffic (LE) | |
83f8fd69 THJ |
2403 | * |
2404 | * Total 8 traffic classes. | |
2405 | */ | |
2406 | ||
2407 | struct cake_sched_data *q = qdisc_priv(sch); | |
2408 | u32 mtu = psched_mtu(qdisc_dev(sch)); | |
2409 | u64 rate = q->rate_bps; | |
cbd22f17 | 2410 | u32 quantum = 256; |
83f8fd69 THJ |
2411 | u32 i; |
2412 | ||
2413 | q->tin_cnt = 8; | |
2414 | ||
2415 | /* codepoint to class mapping */ | |
2416 | q->tin_index = diffserv8; | |
2417 | q->tin_order = normal_order; | |
2418 | ||
2419 | /* class characteristics */ | |
2420 | for (i = 0; i < q->tin_cnt; i++) { | |
2421 | struct cake_tin_data *b = &q->tins[i]; | |
2422 | ||
2423 | cake_set_rate(b, rate, mtu, us_to_ns(q->target), | |
2424 | us_to_ns(q->interval)); | |
2425 | ||
cbd22f17 | 2426 | b->tin_quantum = max_t(u16, 1U, quantum); |
83f8fd69 THJ |
2427 | |
2428 | /* calculate next class's parameters */ | |
2429 | rate *= 7; | |
2430 | rate >>= 3; | |
2431 | ||
cbd22f17 KDB |
2432 | quantum *= 7; |
2433 | quantum >>= 3; | |
83f8fd69 THJ |
2434 | } |
2435 | ||
2436 | return 0; | |
2437 | } | |
2438 | ||
2439 | static int cake_config_diffserv4(struct Qdisc *sch) | |
2440 | { | |
2441 | /* Further pruned list of traffic classes for four-class system: | |
2442 | * | |
2443 | * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) | |
c25af830 KB |
2444 | * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2) |
2445 | * Best Effort (DF, AF1x, TOS2, and those not specified) | |
2446 | * Background Traffic (LE, CS1) | |
83f8fd69 THJ |
2447 | * |
2448 | * Total 4 traffic classes. | |
2449 | */ | |
2450 | ||
2451 | struct cake_sched_data *q = qdisc_priv(sch); | |
2452 | u32 mtu = psched_mtu(qdisc_dev(sch)); | |
2453 | u64 rate = q->rate_bps; | |
2454 | u32 quantum = 1024; | |
2455 | ||
2456 | q->tin_cnt = 4; | |
2457 | ||
2458 | /* codepoint to class mapping */ | |
2459 | q->tin_index = diffserv4; | |
2460 | q->tin_order = bulk_order; | |
2461 | ||
2462 | /* class characteristics */ | |
2463 | cake_set_rate(&q->tins[0], rate, mtu, | |
2464 | us_to_ns(q->target), us_to_ns(q->interval)); | |
2465 | cake_set_rate(&q->tins[1], rate >> 4, mtu, | |
2466 | us_to_ns(q->target), us_to_ns(q->interval)); | |
2467 | cake_set_rate(&q->tins[2], rate >> 1, mtu, | |
2468 | us_to_ns(q->target), us_to_ns(q->interval)); | |
2469 | cake_set_rate(&q->tins[3], rate >> 2, mtu, | |
2470 | us_to_ns(q->target), us_to_ns(q->interval)); | |
2471 | ||
83f8fd69 | 2472 | /* bandwidth-sharing weights */ |
cbd22f17 KDB |
2473 | q->tins[0].tin_quantum = quantum; |
2474 | q->tins[1].tin_quantum = quantum >> 4; | |
2475 | q->tins[2].tin_quantum = quantum >> 1; | |
2476 | q->tins[3].tin_quantum = quantum >> 2; | |
83f8fd69 THJ |
2477 | |
2478 | return 0; | |
2479 | } | |
2480 | ||
2481 | static int cake_config_diffserv3(struct Qdisc *sch) | |
2482 | { | |
2483 | /* Simplified Diffserv structure with 3 tins. | |
c25af830 | 2484 | * Latency Sensitive (CS7, CS6, EF, VA, TOS4) |
83f8fd69 | 2485 | * Best Effort |
c25af830 | 2486 | * Low Priority (LE, CS1) |
83f8fd69 THJ |
2487 | */ |
2488 | struct cake_sched_data *q = qdisc_priv(sch); | |
2489 | u32 mtu = psched_mtu(qdisc_dev(sch)); | |
2490 | u64 rate = q->rate_bps; | |
2491 | u32 quantum = 1024; | |
2492 | ||
2493 | q->tin_cnt = 3; | |
2494 | ||
2495 | /* codepoint to class mapping */ | |
2496 | q->tin_index = diffserv3; | |
2497 | q->tin_order = bulk_order; | |
2498 | ||
2499 | /* class characteristics */ | |
2500 | cake_set_rate(&q->tins[0], rate, mtu, | |
2501 | us_to_ns(q->target), us_to_ns(q->interval)); | |
2502 | cake_set_rate(&q->tins[1], rate >> 4, mtu, | |
2503 | us_to_ns(q->target), us_to_ns(q->interval)); | |
2504 | cake_set_rate(&q->tins[2], rate >> 2, mtu, | |
2505 | us_to_ns(q->target), us_to_ns(q->interval)); | |
2506 | ||
83f8fd69 | 2507 | /* bandwidth-sharing weights */ |
cbd22f17 KDB |
2508 | q->tins[0].tin_quantum = quantum; |
2509 | q->tins[1].tin_quantum = quantum >> 4; | |
2510 | q->tins[2].tin_quantum = quantum >> 2; | |
83f8fd69 THJ |
2511 | |
2512 | return 0; | |
2513 | } | |
2514 | ||
2515 | static void cake_reconfigure(struct Qdisc *sch) | |
2516 | { | |
2517 | struct cake_sched_data *q = qdisc_priv(sch); | |
2518 | int c, ft; | |
2519 | ||
2520 | switch (q->tin_mode) { | |
2521 | case CAKE_DIFFSERV_BESTEFFORT: | |
2522 | ft = cake_config_besteffort(sch); | |
2523 | break; | |
2524 | ||
2525 | case CAKE_DIFFSERV_PRECEDENCE: | |
2526 | ft = cake_config_precedence(sch); | |
2527 | break; | |
2528 | ||
2529 | case CAKE_DIFFSERV_DIFFSERV8: | |
2530 | ft = cake_config_diffserv8(sch); | |
2531 | break; | |
2532 | ||
2533 | case CAKE_DIFFSERV_DIFFSERV4: | |
2534 | ft = cake_config_diffserv4(sch); | |
2535 | break; | |
2536 | ||
2537 | case CAKE_DIFFSERV_DIFFSERV3: | |
2538 | default: | |
2539 | ft = cake_config_diffserv3(sch); | |
2540 | break; | |
2541 | } | |
2542 | ||
046f6fd5 THJ |
2543 | for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { |
2544 | cake_clear_tin(sch, c); | |
2545 | q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; | |
2546 | } | |
2547 | ||
2548 | q->rate_ns = q->tins[ft].tin_rate_ns; | |
2549 | q->rate_shft = q->tins[ft].tin_rate_shft; | |
2550 | ||
2551 | if (q->buffer_config_limit) { | |
2552 | q->buffer_limit = q->buffer_config_limit; | |
2553 | } else if (q->rate_bps) { | |
2554 | u64 t = q->rate_bps * q->interval; | |
2555 | ||
2556 | do_div(t, USEC_PER_SEC / 4); | |
2557 | q->buffer_limit = max_t(u32, t, 4U << 20); | |
2558 | } else { | |
2559 | q->buffer_limit = ~0; | |
2560 | } | |
2561 | ||
2562 | sch->flags &= ~TCQ_F_CAN_BYPASS; | |
2563 | ||
2564 | q->buffer_limit = min(q->buffer_limit, | |
2565 | max(sch->limit * psched_mtu(qdisc_dev(sch)), | |
2566 | q->buffer_config_limit)); | |
2567 | } | |
2568 | ||
2569 | static int cake_change(struct Qdisc *sch, struct nlattr *opt, | |
2570 | struct netlink_ext_ack *extack) | |
2571 | { | |
2572 | struct cake_sched_data *q = qdisc_priv(sch); | |
2573 | struct nlattr *tb[TCA_CAKE_MAX + 1]; | |
2574 | int err; | |
2575 | ||
8cb08174 JB |
2576 | err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, |
2577 | extack); | |
046f6fd5 THJ |
2578 | if (err < 0) |
2579 | return err; | |
2580 | ||
ea825115 THJ |
2581 | if (tb[TCA_CAKE_NAT]) { |
2582 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) | |
2583 | q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; | |
2584 | q->flow_mode |= CAKE_FLOW_NAT_FLAG * | |
2585 | !!nla_get_u32(tb[TCA_CAKE_NAT]); | |
2586 | #else | |
2587 | NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], | |
2588 | "No conntrack support in kernel"); | |
2589 | return -EOPNOTSUPP; | |
2590 | #endif | |
2591 | } | |
2592 | ||
046f6fd5 THJ |
2593 | if (tb[TCA_CAKE_BASE_RATE64]) |
2594 | q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); | |
2595 | ||
83f8fd69 THJ |
2596 | if (tb[TCA_CAKE_DIFFSERV_MODE]) |
2597 | q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); | |
2598 | ||
2599 | if (tb[TCA_CAKE_WASH]) { | |
2600 | if (!!nla_get_u32(tb[TCA_CAKE_WASH])) | |
2601 | q->rate_flags |= CAKE_FLAG_WASH; | |
2602 | else | |
2603 | q->rate_flags &= ~CAKE_FLAG_WASH; | |
2604 | } | |
2605 | ||
046f6fd5 | 2606 | if (tb[TCA_CAKE_FLOW_MODE]) |
ea825115 THJ |
2607 | q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | |
2608 | (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & | |
2609 | CAKE_FLOW_MASK)); | |
046f6fd5 | 2610 | |
a729b7f0 THJ |
2611 | if (tb[TCA_CAKE_ATM]) |
2612 | q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); | |
2613 | ||
2614 | if (tb[TCA_CAKE_OVERHEAD]) { | |
2615 | q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); | |
2616 | q->rate_flags |= CAKE_FLAG_OVERHEAD; | |
2617 | ||
2618 | q->max_netlen = 0; | |
2619 | q->max_adjlen = 0; | |
2620 | q->min_netlen = ~0; | |
2621 | q->min_adjlen = ~0; | |
2622 | } | |
2623 | ||
2624 | if (tb[TCA_CAKE_RAW]) { | |
2625 | q->rate_flags &= ~CAKE_FLAG_OVERHEAD; | |
2626 | ||
2627 | q->max_netlen = 0; | |
2628 | q->max_adjlen = 0; | |
2629 | q->min_netlen = ~0; | |
2630 | q->min_adjlen = ~0; | |
2631 | } | |
2632 | ||
2633 | if (tb[TCA_CAKE_MPU]) | |
2634 | q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); | |
2635 | ||
046f6fd5 THJ |
2636 | if (tb[TCA_CAKE_RTT]) { |
2637 | q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); | |
2638 | ||
2639 | if (!q->interval) | |
2640 | q->interval = 1; | |
2641 | } | |
2642 | ||
2643 | if (tb[TCA_CAKE_TARGET]) { | |
2644 | q->target = nla_get_u32(tb[TCA_CAKE_TARGET]); | |
2645 | ||
2646 | if (!q->target) | |
2647 | q->target = 1; | |
2648 | } | |
2649 | ||
7298de9c THJ |
2650 | if (tb[TCA_CAKE_AUTORATE]) { |
2651 | if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) | |
2652 | q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; | |
2653 | else | |
2654 | q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; | |
2655 | } | |
2656 | ||
2657 | if (tb[TCA_CAKE_INGRESS]) { | |
2658 | if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) | |
2659 | q->rate_flags |= CAKE_FLAG_INGRESS; | |
2660 | else | |
2661 | q->rate_flags &= ~CAKE_FLAG_INGRESS; | |
2662 | } | |
2663 | ||
8b713881 THJ |
2664 | if (tb[TCA_CAKE_ACK_FILTER]) |
2665 | q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); | |
2666 | ||
046f6fd5 THJ |
2667 | if (tb[TCA_CAKE_MEMORY]) |
2668 | q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); | |
2669 | ||
2db6dc26 DT |
2670 | if (tb[TCA_CAKE_SPLIT_GSO]) { |
2671 | if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) | |
2672 | q->rate_flags |= CAKE_FLAG_SPLIT_GSO; | |
2673 | else | |
2674 | q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; | |
2675 | } | |
0c850344 | 2676 | |
0b5c7efd | 2677 | if (tb[TCA_CAKE_FWMARK]) { |
eab2fc82 THJ |
2678 | q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]); |
2679 | q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0; | |
0b5c7efd KDB |
2680 | } |
2681 | ||
046f6fd5 THJ |
2682 | if (q->tins) { |
2683 | sch_tree_lock(sch); | |
2684 | cake_reconfigure(sch); | |
2685 | sch_tree_unlock(sch); | |
2686 | } | |
2687 | ||
2688 | return 0; | |
2689 | } | |
2690 | ||
2691 | static void cake_destroy(struct Qdisc *sch) | |
2692 | { | |
2693 | struct cake_sched_data *q = qdisc_priv(sch); | |
2694 | ||
2695 | qdisc_watchdog_cancel(&q->watchdog); | |
2696 | tcf_block_put(q->block); | |
2697 | kvfree(q->tins); | |
2698 | } | |
2699 | ||
2700 | static int cake_init(struct Qdisc *sch, struct nlattr *opt, | |
2701 | struct netlink_ext_ack *extack) | |
2702 | { | |
2703 | struct cake_sched_data *q = qdisc_priv(sch); | |
2704 | int i, j, err; | |
2705 | ||
2706 | sch->limit = 10240; | |
83f8fd69 | 2707 | q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; |
046f6fd5 THJ |
2708 | q->flow_mode = CAKE_FLOW_TRIPLE; |
2709 | ||
2710 | q->rate_bps = 0; /* unlimited by default */ | |
2711 | ||
2712 | q->interval = 100000; /* 100ms default */ | |
2713 | q->target = 5000; /* 5ms: codel RFC argues | |
2714 | * for 5 to 10% of interval | |
2715 | */ | |
2db6dc26 | 2716 | q->rate_flags |= CAKE_FLAG_SPLIT_GSO; |
046f6fd5 THJ |
2717 | q->cur_tin = 0; |
2718 | q->cur_flow = 0; | |
2719 | ||
2720 | qdisc_watchdog_init(&q->watchdog, sch); | |
2721 | ||
2722 | if (opt) { | |
3f608f0c | 2723 | err = cake_change(sch, opt, extack); |
046f6fd5 THJ |
2724 | |
2725 | if (err) | |
2726 | return err; | |
2727 | } | |
2728 | ||
2729 | err = tcf_block_get(&q->block, &q->filter_list, sch, extack); | |
2730 | if (err) | |
2731 | return err; | |
2732 | ||
2733 | quantum_div[0] = ~0; | |
2734 | for (i = 1; i <= CAKE_QUEUES; i++) | |
2735 | quantum_div[i] = 65535 / i; | |
2736 | ||
329e0989 | 2737 | q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data), |
046f6fd5 THJ |
2738 | GFP_KERNEL); |
2739 | if (!q->tins) | |
ab443c53 | 2740 | return -ENOMEM; |
046f6fd5 THJ |
2741 | |
2742 | for (i = 0; i < CAKE_MAX_TINS; i++) { | |
2743 | struct cake_tin_data *b = q->tins + i; | |
2744 | ||
2745 | INIT_LIST_HEAD(&b->new_flows); | |
2746 | INIT_LIST_HEAD(&b->old_flows); | |
2747 | INIT_LIST_HEAD(&b->decaying_flows); | |
2748 | b->sparse_flow_count = 0; | |
2749 | b->bulk_flow_count = 0; | |
2750 | b->decaying_flow_count = 0; | |
2751 | ||
2752 | for (j = 0; j < CAKE_QUEUES; j++) { | |
2753 | struct cake_flow *flow = b->flows + j; | |
2754 | u32 k = j * CAKE_MAX_TINS + i; | |
2755 | ||
2756 | INIT_LIST_HEAD(&flow->flowchain); | |
2757 | cobalt_vars_init(&flow->cvars); | |
2758 | ||
2759 | q->overflow_heap[k].t = i; | |
2760 | q->overflow_heap[k].b = j; | |
2761 | b->overflow_idx[j] = k; | |
2762 | } | |
2763 | } | |
2764 | ||
2765 | cake_reconfigure(sch); | |
2766 | q->avg_peak_bandwidth = q->rate_bps; | |
2767 | q->min_netlen = ~0; | |
2768 | q->min_adjlen = ~0; | |
2769 | return 0; | |
046f6fd5 THJ |
2770 | } |
2771 | ||
2772 | static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) | |
2773 | { | |
2774 | struct cake_sched_data *q = qdisc_priv(sch); | |
2775 | struct nlattr *opts; | |
2776 | ||
ae0be8de | 2777 | opts = nla_nest_start_noflag(skb, TCA_OPTIONS); |
046f6fd5 THJ |
2778 | if (!opts) |
2779 | goto nla_put_failure; | |
2780 | ||
2781 | if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps, | |
2782 | TCA_CAKE_PAD)) | |
2783 | goto nla_put_failure; | |
2784 | ||
2785 | if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, | |
2786 | q->flow_mode & CAKE_FLOW_MASK)) | |
2787 | goto nla_put_failure; | |
2788 | ||
2789 | if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) | |
2790 | goto nla_put_failure; | |
2791 | ||
2792 | if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) | |
2793 | goto nla_put_failure; | |
2794 | ||
2795 | if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) | |
2796 | goto nla_put_failure; | |
2797 | ||
7298de9c THJ |
2798 | if (nla_put_u32(skb, TCA_CAKE_AUTORATE, |
2799 | !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) | |
2800 | goto nla_put_failure; | |
2801 | ||
2802 | if (nla_put_u32(skb, TCA_CAKE_INGRESS, | |
2803 | !!(q->rate_flags & CAKE_FLAG_INGRESS))) | |
2804 | goto nla_put_failure; | |
2805 | ||
8b713881 THJ |
2806 | if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) |
2807 | goto nla_put_failure; | |
2808 | ||
ea825115 THJ |
2809 | if (nla_put_u32(skb, TCA_CAKE_NAT, |
2810 | !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) | |
2811 | goto nla_put_failure; | |
2812 | ||
83f8fd69 THJ |
2813 | if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) |
2814 | goto nla_put_failure; | |
2815 | ||
2816 | if (nla_put_u32(skb, TCA_CAKE_WASH, | |
2817 | !!(q->rate_flags & CAKE_FLAG_WASH))) | |
2818 | goto nla_put_failure; | |
a729b7f0 THJ |
2819 | |
2820 | if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead)) | |
2821 | goto nla_put_failure; | |
2822 | ||
2823 | if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) | |
2824 | if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) | |
2825 | goto nla_put_failure; | |
2826 | ||
2827 | if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) | |
2828 | goto nla_put_failure; | |
2829 | ||
2830 | if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) | |
2831 | goto nla_put_failure; | |
0c850344 THJ |
2832 | |
2833 | if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, | |
2834 | !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) | |
2835 | goto nla_put_failure; | |
0b5c7efd | 2836 | |
eab2fc82 | 2837 | if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask)) |
0b5c7efd | 2838 | goto nla_put_failure; |
83f8fd69 | 2839 | |
046f6fd5 THJ |
2840 | return nla_nest_end(skb, opts); |
2841 | ||
2842 | nla_put_failure: | |
2843 | return -1; | |
2844 | } | |
2845 | ||
2846 | static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) | |
2847 | { | |
ae0be8de | 2848 | struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); |
046f6fd5 THJ |
2849 | struct cake_sched_data *q = qdisc_priv(sch); |
2850 | struct nlattr *tstats, *ts; | |
2851 | int i; | |
2852 | ||
2853 | if (!stats) | |
2854 | return -1; | |
2855 | ||
2856 | #define PUT_STAT_U32(attr, data) do { \ | |
2857 | if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ | |
2858 | goto nla_put_failure; \ | |
2859 | } while (0) | |
2860 | #define PUT_STAT_U64(attr, data) do { \ | |
2861 | if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ | |
2862 | data, TCA_CAKE_STATS_PAD)) \ | |
2863 | goto nla_put_failure; \ | |
2864 | } while (0) | |
2865 | ||
2866 | PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); | |
2867 | PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); | |
2868 | PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); | |
2869 | PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); | |
2870 | PUT_STAT_U32(MAX_NETLEN, q->max_netlen); | |
2871 | PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); | |
2872 | PUT_STAT_U32(MIN_NETLEN, q->min_netlen); | |
2873 | PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); | |
2874 | ||
2875 | #undef PUT_STAT_U32 | |
2876 | #undef PUT_STAT_U64 | |
2877 | ||
ae0be8de | 2878 | tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS); |
046f6fd5 THJ |
2879 | if (!tstats) |
2880 | goto nla_put_failure; | |
2881 | ||
2882 | #define PUT_TSTAT_U32(attr, data) do { \ | |
2883 | if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ | |
2884 | goto nla_put_failure; \ | |
2885 | } while (0) | |
2886 | #define PUT_TSTAT_U64(attr, data) do { \ | |
2887 | if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ | |
2888 | data, TCA_CAKE_TIN_STATS_PAD)) \ | |
2889 | goto nla_put_failure; \ | |
2890 | } while (0) | |
2891 | ||
2892 | for (i = 0; i < q->tin_cnt; i++) { | |
83f8fd69 | 2893 | struct cake_tin_data *b = &q->tins[q->tin_order[i]]; |
046f6fd5 | 2894 | |
ae0be8de | 2895 | ts = nla_nest_start_noflag(d->skb, i + 1); |
046f6fd5 THJ |
2896 | if (!ts) |
2897 | goto nla_put_failure; | |
2898 | ||
2899 | PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); | |
2900 | PUT_TSTAT_U64(SENT_BYTES64, b->bytes); | |
2901 | PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); | |
2902 | ||
2903 | PUT_TSTAT_U32(TARGET_US, | |
2904 | ktime_to_us(ns_to_ktime(b->cparams.target))); | |
2905 | PUT_TSTAT_U32(INTERVAL_US, | |
2906 | ktime_to_us(ns_to_ktime(b->cparams.interval))); | |
2907 | ||
2908 | PUT_TSTAT_U32(SENT_PACKETS, b->packets); | |
2909 | PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); | |
2910 | PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); | |
2911 | PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); | |
2912 | ||
2913 | PUT_TSTAT_U32(PEAK_DELAY_US, | |
2914 | ktime_to_us(ns_to_ktime(b->peak_delay))); | |
2915 | PUT_TSTAT_U32(AVG_DELAY_US, | |
2916 | ktime_to_us(ns_to_ktime(b->avge_delay))); | |
2917 | PUT_TSTAT_U32(BASE_DELAY_US, | |
2918 | ktime_to_us(ns_to_ktime(b->base_delay))); | |
2919 | ||
2920 | PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); | |
2921 | PUT_TSTAT_U32(WAY_MISSES, b->way_misses); | |
2922 | PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); | |
2923 | ||
2924 | PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + | |
2925 | b->decaying_flow_count); | |
2926 | PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); | |
2927 | PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); | |
2928 | PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); | |
2929 | ||
2930 | PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); | |
2931 | nla_nest_end(d->skb, ts); | |
2932 | } | |
2933 | ||
2934 | #undef PUT_TSTAT_U32 | |
2935 | #undef PUT_TSTAT_U64 | |
2936 | ||
2937 | nla_nest_end(d->skb, tstats); | |
2938 | return nla_nest_end(d->skb, stats); | |
2939 | ||
2940 | nla_put_failure: | |
2941 | nla_nest_cancel(d->skb, stats); | |
2942 | return -1; | |
2943 | } | |
2944 | ||
2945 | static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) | |
2946 | { | |
2947 | return NULL; | |
2948 | } | |
2949 | ||
2950 | static unsigned long cake_find(struct Qdisc *sch, u32 classid) | |
2951 | { | |
2952 | return 0; | |
2953 | } | |
2954 | ||
2955 | static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, | |
2956 | u32 classid) | |
2957 | { | |
2958 | return 0; | |
2959 | } | |
2960 | ||
2961 | static void cake_unbind(struct Qdisc *q, unsigned long cl) | |
2962 | { | |
2963 | } | |
2964 | ||
2965 | static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, | |
2966 | struct netlink_ext_ack *extack) | |
2967 | { | |
2968 | struct cake_sched_data *q = qdisc_priv(sch); | |
2969 | ||
2970 | if (cl) | |
2971 | return NULL; | |
2972 | return q->block; | |
2973 | } | |
2974 | ||
2975 | static int cake_dump_class(struct Qdisc *sch, unsigned long cl, | |
2976 | struct sk_buff *skb, struct tcmsg *tcm) | |
2977 | { | |
2978 | tcm->tcm_handle |= TC_H_MIN(cl); | |
2979 | return 0; | |
2980 | } | |
2981 | ||
2982 | static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, | |
2983 | struct gnet_dump *d) | |
2984 | { | |
2985 | struct cake_sched_data *q = qdisc_priv(sch); | |
2986 | const struct cake_flow *flow = NULL; | |
2987 | struct gnet_stats_queue qs = { 0 }; | |
2988 | struct nlattr *stats; | |
2989 | u32 idx = cl - 1; | |
2990 | ||
2991 | if (idx < CAKE_QUEUES * q->tin_cnt) { | |
83f8fd69 THJ |
2992 | const struct cake_tin_data *b = \ |
2993 | &q->tins[q->tin_order[idx / CAKE_QUEUES]]; | |
046f6fd5 THJ |
2994 | const struct sk_buff *skb; |
2995 | ||
2996 | flow = &b->flows[idx % CAKE_QUEUES]; | |
2997 | ||
2998 | if (flow->head) { | |
2999 | sch_tree_lock(sch); | |
3000 | skb = flow->head; | |
3001 | while (skb) { | |
3002 | qs.qlen++; | |
3003 | skb = skb->next; | |
3004 | } | |
3005 | sch_tree_unlock(sch); | |
3006 | } | |
3007 | qs.backlog = b->backlogs[idx % CAKE_QUEUES]; | |
3008 | qs.drops = flow->dropped; | |
3009 | } | |
3010 | if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) | |
3011 | return -1; | |
3012 | if (flow) { | |
3013 | ktime_t now = ktime_get(); | |
3014 | ||
ae0be8de | 3015 | stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); |
046f6fd5 THJ |
3016 | if (!stats) |
3017 | return -1; | |
3018 | ||
3019 | #define PUT_STAT_U32(attr, data) do { \ | |
3020 | if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ | |
3021 | goto nla_put_failure; \ | |
3022 | } while (0) | |
3023 | #define PUT_STAT_S32(attr, data) do { \ | |
3024 | if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ | |
3025 | goto nla_put_failure; \ | |
3026 | } while (0) | |
3027 | ||
3028 | PUT_STAT_S32(DEFICIT, flow->deficit); | |
3029 | PUT_STAT_U32(DROPPING, flow->cvars.dropping); | |
3030 | PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); | |
3031 | PUT_STAT_U32(P_DROP, flow->cvars.p_drop); | |
3032 | if (flow->cvars.p_drop) { | |
3033 | PUT_STAT_S32(BLUE_TIMER_US, | |
3034 | ktime_to_us( | |
3035 | ktime_sub(now, | |
3f608f0c | 3036 | flow->cvars.blue_timer))); |
046f6fd5 THJ |
3037 | } |
3038 | if (flow->cvars.dropping) { | |
3039 | PUT_STAT_S32(DROP_NEXT_US, | |
3040 | ktime_to_us( | |
3041 | ktime_sub(now, | |
3042 | flow->cvars.drop_next))); | |
3043 | } | |
3044 | ||
3045 | if (nla_nest_end(d->skb, stats) < 0) | |
3046 | return -1; | |
3047 | } | |
3048 | ||
3049 | return 0; | |
3050 | ||
3051 | nla_put_failure: | |
3052 | nla_nest_cancel(d->skb, stats); | |
3053 | return -1; | |
3054 | } | |
3055 | ||
3056 | static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) | |
3057 | { | |
3058 | struct cake_sched_data *q = qdisc_priv(sch); | |
3059 | unsigned int i, j; | |
3060 | ||
3061 | if (arg->stop) | |
3062 | return; | |
3063 | ||
3064 | for (i = 0; i < q->tin_cnt; i++) { | |
83f8fd69 | 3065 | struct cake_tin_data *b = &q->tins[q->tin_order[i]]; |
046f6fd5 THJ |
3066 | |
3067 | for (j = 0; j < CAKE_QUEUES; j++) { | |
e046fa89 | 3068 | if (list_empty(&b->flows[j].flowchain)) { |
046f6fd5 THJ |
3069 | arg->count++; |
3070 | continue; | |
3071 | } | |
e046fa89 ZS |
3072 | if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1, |
3073 | arg)) | |
046f6fd5 | 3074 | break; |
046f6fd5 THJ |
3075 | } |
3076 | } | |
3077 | } | |
3078 | ||
3079 | static const struct Qdisc_class_ops cake_class_ops = { | |
3080 | .leaf = cake_leaf, | |
3081 | .find = cake_find, | |
3082 | .tcf_block = cake_tcf_block, | |
3083 | .bind_tcf = cake_bind, | |
3084 | .unbind_tcf = cake_unbind, | |
3085 | .dump = cake_dump_class, | |
3086 | .dump_stats = cake_dump_class_stats, | |
3087 | .walk = cake_walk, | |
3088 | }; | |
3089 | ||
3090 | static struct Qdisc_ops cake_qdisc_ops __read_mostly = { | |
3091 | .cl_ops = &cake_class_ops, | |
3092 | .id = "cake", | |
3093 | .priv_size = sizeof(struct cake_sched_data), | |
3094 | .enqueue = cake_enqueue, | |
3095 | .dequeue = cake_dequeue, | |
3096 | .peek = qdisc_peek_dequeued, | |
3097 | .init = cake_init, | |
3098 | .reset = cake_reset, | |
3099 | .destroy = cake_destroy, | |
3100 | .change = cake_change, | |
3101 | .dump = cake_dump, | |
3102 | .dump_stats = cake_dump_stats, | |
3103 | .owner = THIS_MODULE, | |
3104 | }; | |
3105 | ||
3106 | static int __init cake_module_init(void) | |
3107 | { | |
3108 | return register_qdisc(&cake_qdisc_ops); | |
3109 | } | |
3110 | ||
3111 | static void __exit cake_module_exit(void) | |
3112 | { | |
3113 | unregister_qdisc(&cake_qdisc_ops); | |
3114 | } | |
3115 | ||
3116 | module_init(cake_module_init) | |
3117 | module_exit(cake_module_exit) | |
3118 | MODULE_AUTHOR("Jonathan Morton"); | |
3119 | MODULE_LICENSE("Dual BSD/GPL"); | |
3120 | MODULE_DESCRIPTION("The CAKE shaper."); |