Commit | Line | Data |
---|---|---|
3c4287f6 SB |
1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | ||
3 | /* PIPAPO: PIle PAcket POlicies: set for arbitrary concatenations of ranges | |
4 | * | |
5 | * Copyright (c) 2019-2020 Red Hat GmbH | |
6 | * | |
7 | * Author: Stefano Brivio <sbrivio@redhat.com> | |
8 | */ | |
9 | ||
10 | /** | |
11 | * DOC: Theory of Operation | |
12 | * | |
13 | * | |
14 | * Problem | |
15 | * ------- | |
16 | * | |
17 | * Match packet bytes against entries composed of ranged or non-ranged packet | |
18 | * field specifiers, mapping them to arbitrary references. For example: | |
19 | * | |
20 | * :: | |
21 | * | |
22 | * --- fields ---> | |
23 | * | [net],[port],[net]... => [reference] | |
24 | * entries [net],[port],[net]... => [reference] | |
25 | * | [net],[port],[net]... => [reference] | |
26 | * V ... | |
27 | * | |
28 | * where [net] fields can be IP ranges or netmasks, and [port] fields are port | |
29 | * ranges. Arbitrary packet fields can be matched. | |
30 | * | |
31 | * | |
32 | * Algorithm Overview | |
33 | * ------------------ | |
34 | * | |
35 | * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally | |
36 | * relies on the consideration that every contiguous range in a space of b bits | |
37 | * can be converted into b * 2 netmasks, from Theorem 3 in [Rottenstreich 2010], | |
38 | * as also illustrated in Section 9 of [Kogan 2014]. | |
39 | * | |
40 | * Classification against a number of entries, that require matching given bits | |
41 | * of a packet field, is performed by grouping those bits in sets of arbitrary | |
42 | * size, and classifying packet bits one group at a time. | |
43 | * | |
44 | * Example: | |
45 | * to match the source port (16 bits) of a packet, we can divide those 16 bits | |
46 | * in 4 groups of 4 bits each. Given the entry: | |
47 | * 0000 0001 0101 1001 | |
48 | * and a packet with source port: | |
49 | * 0000 0001 1010 1001 | |
50 | * first and second groups match, but the third doesn't. We conclude that the | |
51 | * packet doesn't match the given entry. | |
52 | * | |
53 | * Translate the set to a sequence of lookup tables, one per field. Each table | |
54 | * has two dimensions: bit groups to be matched for a single packet field, and | |
55 | * all the possible values of said groups (buckets). Input entries are | |
56 | * represented as one or more rules, depending on the number of composing | |
57 | * netmasks for the given field specifier, and a group match is indicated as a | |
58 | * set bit, with number corresponding to the rule index, in all the buckets | |
59 | * whose value matches the entry for a given group. | |
60 | * | |
61 | * Rules are mapped between fields through an array of x, n pairs, with each | |
62 | * item mapping a matched rule to one or more rules. The position of the pair in | |
63 | * the array indicates the matched rule to be mapped to the next field, x | |
64 | * indicates the first rule index in the next field, and n the amount of | |
65 | * next-field rules the current rule maps to. | |
66 | * | |
67 | * The mapping array for the last field maps to the desired references. | |
68 | * | |
69 | * To match, we perform table lookups using the values of grouped packet bits, | |
70 | * and use a sequence of bitwise operations to progressively evaluate rule | |
71 | * matching. | |
72 | * | |
73 | * A stand-alone, reference implementation, also including notes about possible | |
74 | * future optimisations, is available at: | |
75 | * https://pipapo.lameexcu.se/ | |
76 | * | |
77 | * Insertion | |
78 | * --------- | |
79 | * | |
80 | * - For each packet field: | |
81 | * | |
82 | * - divide the b packet bits we want to classify into groups of size t, | |
83 | * obtaining ceil(b / t) groups | |
84 | * | |
85 | * Example: match on destination IP address, with t = 4: 32 bits, 8 groups | |
86 | * of 4 bits each | |
87 | * | |
88 | * - allocate a lookup table with one column ("bucket") for each possible | |
89 | * value of a group, and with one row for each group | |
90 | * | |
91 | * Example: 8 groups, 2^4 buckets: | |
92 | * | |
93 | * :: | |
94 | * | |
95 | * bucket | |
96 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
97 | * 0 | |
98 | * 1 | |
99 | * 2 | |
100 | * 3 | |
101 | * 4 | |
102 | * 5 | |
103 | * 6 | |
104 | * 7 | |
105 | * | |
106 | * - map the bits we want to classify for the current field, for a given | |
107 | * entry, to a single rule for non-ranged and netmask set items, and to one | |
108 | * or multiple rules for ranges. Ranges are expanded to composing netmasks | |
109 | * by pipapo_expand(). | |
110 | * | |
111 | * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048 | |
112 | * - rule #0: 10.0.0.5 | |
113 | * - rule #1: 192.168.1.0/24 | |
114 | * - rule #2: 192.168.2.0/31 | |
115 | * | |
116 | * - insert references to the rules in the lookup table, selecting buckets | |
117 | * according to bit values of a rule in the given group. This is done by | |
118 | * pipapo_insert(). | |
119 | * | |
120 | * Example: given: | |
121 | * - rule #0: 10.0.0.5 mapping to buckets | |
122 | * < 0 10 0 0 0 0 0 5 > | |
123 | * - rule #1: 192.168.1.0/24 mapping to buckets | |
124 | * < 12 0 10 8 0 1 < 0..15 > < 0..15 > > | |
125 | * - rule #2: 192.168.2.0/31 mapping to buckets | |
126 | * < 12 0 10 8 0 2 0 < 0..1 > > | |
127 | * | |
128 | * these bits are set in the lookup table: | |
129 | * | |
130 | * :: | |
131 | * | |
132 | * bucket | |
133 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
134 | * 0 0 1,2 | |
135 | * 1 1,2 0 | |
136 | * 2 0 1,2 | |
137 | * 3 0 1,2 | |
138 | * 4 0,1,2 | |
139 | * 5 0 1 2 | |
140 | * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | |
141 | * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 | |
142 | * | |
143 | * - if this is not the last field in the set, fill a mapping array that maps | |
144 | * rules from the lookup table to rules belonging to the same entry in | |
145 | * the next lookup table, done by pipapo_map(). | |
146 | * | |
147 | * Note that as rules map to contiguous ranges of rules, given how netmask | |
148 | * expansion and insertion is performed, &union nft_pipapo_map_bucket stores | |
149 | * this information as pairs of first rule index, rule count. | |
150 | * | |
151 | * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048, | |
152 | * given lookup table #0 for field 0 (see example above): | |
153 | * | |
154 | * :: | |
155 | * | |
156 | * bucket | |
157 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
158 | * 0 0 1,2 | |
159 | * 1 1,2 0 | |
160 | * 2 0 1,2 | |
161 | * 3 0 1,2 | |
162 | * 4 0,1,2 | |
163 | * 5 0 1 2 | |
164 | * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | |
165 | * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 | |
166 | * | |
167 | * and lookup table #1 for field 1 with: | |
168 | * - rule #0: 1024 mapping to buckets | |
169 | * < 0 0 4 0 > | |
170 | * - rule #1: 2048 mapping to buckets | |
171 | * < 0 0 5 0 > | |
172 | * | |
173 | * :: | |
174 | * | |
175 | * bucket | |
176 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
177 | * 0 0,1 | |
178 | * 1 0,1 | |
179 | * 2 0 1 | |
180 | * 3 0,1 | |
181 | * | |
182 | * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024 | |
183 | * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1 | |
184 | * (rules #1, #2) to 2048 in lookup table #2 (rule #1): | |
185 | * | |
186 | * :: | |
187 | * | |
188 | * rule indices in current field: 0 1 2 | |
189 | * map to rules in next field: 0 1 1 | |
190 | * | |
191 | * - if this is the last field in the set, fill a mapping array that maps | |
192 | * rules from the last lookup table to element pointers, also done by | |
193 | * pipapo_map(). | |
194 | * | |
195 | * Note that, in this implementation, we have two elements (start, end) for | |
196 | * each entry. The pointer to the end element is stored in this array, and | |
197 | * the pointer to the start element is linked from it. | |
198 | * | |
199 | * Example: entry 10.0.0.5:1024 has a corresponding &struct nft_pipapo_elem | |
200 | * pointer, 0x66, and element for 192.168.1.0-192.168.2.1:2048 is at 0x42. | |
201 | * From the rules of lookup table #1 as mapped above: | |
202 | * | |
203 | * :: | |
204 | * | |
205 | * rule indices in last field: 0 1 | |
bd97ad51 | 206 | * map to elements: 0x66 0x42 |
3c4287f6 SB |
207 | * |
208 | * | |
209 | * Matching | |
210 | * -------- | |
211 | * | |
212 | * We use a result bitmap, with the size of a single lookup table bucket, to | |
213 | * represent the matching state that applies at every algorithm step. This is | |
214 | * done by pipapo_lookup(). | |
215 | * | |
216 | * - For each packet field: | |
217 | * | |
218 | * - start with an all-ones result bitmap (res_map in pipapo_lookup()) | |
219 | * | |
220 | * - perform a lookup into the table corresponding to the current field, | |
221 | * for each group, and at every group, AND the current result bitmap with | |
222 | * the value from the lookup table bucket | |
223 | * | |
224 | * :: | |
225 | * | |
226 | * Example: 192.168.1.5 < 12 0 10 8 0 1 0 5 >, with lookup table from | |
227 | * insertion examples. | |
228 | * Lookup table buckets are at least 3 bits wide, we'll assume 8 bits for | |
229 | * convenience in this example. Initial result bitmap is 0xff, the steps | |
230 | * below show the value of the result bitmap after each group is processed: | |
231 | * | |
232 | * bucket | |
233 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
234 | * 0 0 1,2 | |
235 | * result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6 | |
236 | * | |
237 | * 1 1,2 0 | |
238 | * result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6 | |
239 | * | |
240 | * 2 0 1,2 | |
241 | * result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6 | |
242 | * | |
243 | * 3 0 1,2 | |
244 | * result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6 | |
245 | * | |
246 | * 4 0,1,2 | |
247 | * result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6 | |
248 | * | |
249 | * 5 0 1 2 | |
250 | * result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2 | |
251 | * | |
252 | * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | |
253 | * result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2 | |
254 | * | |
255 | * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 | |
256 | * final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2 | |
257 | * | |
258 | * - at the next field, start with a new, all-zeroes result bitmap. For each | |
259 | * bit set in the previous result bitmap, fill the new result bitmap | |
260 | * (fill_map in pipapo_lookup()) with the rule indices from the | |
261 | * corresponding buckets of the mapping field for this field, done by | |
262 | * pipapo_refill() | |
263 | * | |
264 | * Example: with mapping table from insertion examples, with the current | |
265 | * result bitmap from the previous example, 0x02: | |
266 | * | |
267 | * :: | |
268 | * | |
269 | * rule indices in current field: 0 1 2 | |
270 | * map to rules in next field: 0 1 1 | |
271 | * | |
272 | * the new result bitmap will be 0x02: rule 1 was set, and rule 1 will be | |
273 | * set. | |
274 | * | |
275 | * We can now extend this example to cover the second iteration of the step | |
276 | * above (lookup and AND bitmap): assuming the port field is | |
277 | * 2048 < 0 0 5 0 >, with starting result bitmap 0x2, and lookup table | |
278 | * for "port" field from pre-computation example: | |
279 | * | |
280 | * :: | |
281 | * | |
282 | * bucket | |
283 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
284 | * 0 0,1 | |
285 | * 1 0,1 | |
286 | * 2 0 1 | |
287 | * 3 0,1 | |
288 | * | |
289 | * operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5] | |
290 | * & 0x3 [bucket 0], resulting bitmap is 0x2. | |
291 | * | |
292 | * - if this is the last field in the set, look up the value from the mapping | |
293 | * array corresponding to the final result bitmap | |
294 | * | |
295 | * Example: 0x2 resulting bitmap from 192.168.1.5:2048, mapping array for | |
296 | * last field from insertion example: | |
297 | * | |
298 | * :: | |
299 | * | |
300 | * rule indices in last field: 0 1 | |
bd97ad51 | 301 | * map to elements: 0x66 0x42 |
3c4287f6 SB |
302 | * |
303 | * the matching element is at 0x42. | |
304 | * | |
305 | * | |
306 | * References | |
307 | * ---------- | |
308 | * | |
309 | * [Ligatti 2010] | |
310 | * A Packet-classification Algorithm for Arbitrary Bitmask Rules, with | |
311 | * Automatic Time-space Tradeoffs | |
312 | * Jay Ligatti, Josh Kuhn, and Chris Gage. | |
313 | * Proceedings of the IEEE International Conference on Computer | |
314 | * Communication Networks (ICCCN), August 2010. | |
50935339 | 315 | * https://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf |
3c4287f6 SB |
316 | * |
317 | * [Rottenstreich 2010] | |
318 | * Worst-Case TCAM Rule Expansion | |
319 | * Ori Rottenstreich and Isaac Keslassy. | |
320 | * 2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010. | |
321 | * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf | |
322 | * | |
323 | * [Kogan 2014] | |
324 | * SAX-PAC (Scalable And eXpressive PAcket Classification) | |
325 | * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, | |
326 | * and Patrick Eugster. | |
327 | * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. | |
50935339 | 328 | * https://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf |
3c4287f6 SB |
329 | */ |
330 | ||
331 | #include <linux/kernel.h> | |
332 | #include <linux/init.h> | |
3c4287f6 SB |
333 | #include <linux/module.h> |
334 | #include <linux/netlink.h> | |
335 | #include <linux/netfilter.h> | |
336 | #include <linux/netfilter/nf_tables.h> | |
337 | #include <net/netfilter/nf_tables_core.h> | |
338 | #include <uapi/linux/netfilter/nf_tables.h> | |
3c4287f6 SB |
339 | #include <linux/bitmap.h> |
340 | #include <linux/bitops.h> | |
341 | ||
7400b063 | 342 | #include "nft_set_pipapo_avx2.h" |
8683f4b9 | 343 | #include "nft_set_pipapo.h" |
3c4287f6 SB |
344 | |
345 | /* Current working bitmap index, toggled between field matches */ | |
346 | static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); | |
347 | ||
3c4287f6 SB |
348 | /** |
349 | * pipapo_refill() - For each set bit, set bits from selected mapping table item | |
350 | * @map: Bitmap to be scanned for set bits | |
351 | * @len: Length of bitmap in longs | |
352 | * @rules: Number of rules in field | |
353 | * @dst: Destination bitmap | |
354 | * @mt: Mapping table containing bit set specifiers | |
355 | * @match_only: Find a single bit and return, don't fill | |
356 | * | |
357 | * Iteration over set bits with __builtin_ctzl(): Daniel Lemire, public domain. | |
358 | * | |
359 | * For each bit set in map, select the bucket from mapping table with index | |
360 | * corresponding to the position of the bit set. Use start bit and amount of | |
361 | * bits specified in bucket to fill region in dst. | |
362 | * | |
363 | * Return: -1 on no match, bit position on 'match_only', 0 otherwise. | |
364 | */ | |
8683f4b9 SB |
365 | int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, |
366 | union nft_pipapo_map_bucket *mt, bool match_only) | |
3c4287f6 SB |
367 | { |
368 | unsigned long bitset; | |
369 | int k, ret = -1; | |
370 | ||
371 | for (k = 0; k < len; k++) { | |
372 | bitset = map[k]; | |
373 | while (bitset) { | |
374 | unsigned long t = bitset & -bitset; | |
375 | int r = __builtin_ctzl(bitset); | |
376 | int i = k * BITS_PER_LONG + r; | |
377 | ||
378 | if (unlikely(i >= rules)) { | |
379 | map[k] = 0; | |
380 | return -1; | |
381 | } | |
382 | ||
9a771204 | 383 | if (match_only) { |
3c4287f6 SB |
384 | bitmap_clear(map, i, 1); |
385 | return i; | |
386 | } | |
387 | ||
388 | ret = 0; | |
389 | ||
390 | bitmap_set(dst, mt[i].to, mt[i].n); | |
391 | ||
392 | bitset ^= t; | |
393 | } | |
394 | map[k] = 0; | |
395 | } | |
396 | ||
397 | return ret; | |
398 | } | |
399 | ||
400 | /** | |
401 | * nft_pipapo_lookup() - Lookup function | |
402 | * @net: Network namespace | |
403 | * @set: nftables API set representation | |
3db86c39 | 404 | * @key: nftables API element representation containing key data |
3c4287f6 SB |
405 | * @ext: nftables API extension pointer, filled with matching reference |
406 | * | |
407 | * For more details, see DOC: Theory of Operation. | |
408 | * | |
409 | * Return: true on match, false otherwise. | |
410 | */ | |
f0b3d338 SB |
411 | bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, |
412 | const u32 *key, const struct nft_set_ext **ext) | |
3c4287f6 SB |
413 | { |
414 | struct nft_pipapo *priv = nft_set_priv(set); | |
415 | unsigned long *res_map, *fill_map; | |
416 | u8 genmask = nft_genmask_cur(net); | |
417 | const u8 *rp = (const u8 *)key; | |
418 | struct nft_pipapo_match *m; | |
419 | struct nft_pipapo_field *f; | |
420 | bool map_index; | |
421 | int i; | |
422 | ||
423 | local_bh_disable(); | |
424 | ||
425 | map_index = raw_cpu_read(nft_pipapo_scratch_index); | |
426 | ||
427 | m = rcu_dereference(priv->match); | |
428 | ||
429 | if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) | |
430 | goto out; | |
431 | ||
432 | res_map = *raw_cpu_ptr(m->scratch) + (map_index ? m->bsize_max : 0); | |
433 | fill_map = *raw_cpu_ptr(m->scratch) + (map_index ? 0 : m->bsize_max); | |
434 | ||
435 | memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); | |
436 | ||
437 | nft_pipapo_for_each_field(f, i, m) { | |
438 | bool last = i == m->field_count - 1; | |
e807b13c | 439 | int b; |
3c4287f6 | 440 | |
e807b13c | 441 | /* For each bit group: select lookup table bucket depending on |
3c4287f6 SB |
442 | * packet bytes value, then AND bucket value |
443 | */ | |
4051f431 SB |
444 | if (likely(f->bb == 8)) |
445 | pipapo_and_field_buckets_8bit(f, res_map, rp); | |
446 | else | |
447 | pipapo_and_field_buckets_4bit(f, res_map, rp); | |
448 | NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; | |
e807b13c SB |
449 | |
450 | rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); | |
3c4287f6 SB |
451 | |
452 | /* Now populate the bitmap for the next field, unless this is | |
453 | * the last field, in which case return the matched 'ext' | |
454 | * pointer if any. | |
455 | * | |
456 | * Now res_map contains the matching bitmap, and fill_map is the | |
457 | * bitmap for the next field. | |
458 | */ | |
459 | next_match: | |
460 | b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, | |
461 | last); | |
462 | if (b < 0) { | |
463 | raw_cpu_write(nft_pipapo_scratch_index, map_index); | |
464 | local_bh_enable(); | |
465 | ||
466 | return false; | |
467 | } | |
468 | ||
469 | if (last) { | |
470 | *ext = &f->mt[b].e->ext; | |
471 | if (unlikely(nft_set_elem_expired(*ext) || | |
472 | !nft_set_elem_active(*ext, genmask))) | |
473 | goto next_match; | |
474 | ||
475 | /* Last field: we're just returning the key without | |
476 | * filling the initial bitmap for the next field, so the | |
477 | * current inactive bitmap is clean and can be reused as | |
478 | * *next* bitmap (not initial) for the next packet. | |
479 | */ | |
480 | raw_cpu_write(nft_pipapo_scratch_index, map_index); | |
481 | local_bh_enable(); | |
482 | ||
483 | return true; | |
484 | } | |
485 | ||
486 | /* Swap bitmap indices: res_map is the initial bitmap for the | |
487 | * next field, and fill_map is guaranteed to be all-zeroes at | |
488 | * this point. | |
489 | */ | |
490 | map_index = !map_index; | |
491 | swap(res_map, fill_map); | |
492 | ||
e807b13c | 493 | rp += NFT_PIPAPO_GROUPS_PADDING(f); |
3c4287f6 SB |
494 | } |
495 | ||
496 | out: | |
497 | local_bh_enable(); | |
498 | return false; | |
499 | } | |
500 | ||
501 | /** | |
502 | * pipapo_get() - Get matching element reference given key data | |
503 | * @net: Network namespace | |
504 | * @set: nftables API set representation | |
505 | * @data: Key data to be matched against existing elements | |
506 | * @genmask: If set, check that element is active in given genmask | |
507 | * | |
508 | * This is essentially the same as the lookup function, except that it matches | |
509 | * key data against the uncommitted copy and doesn't use preallocated maps for | |
510 | * bitmap results. | |
511 | * | |
512 | * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise. | |
513 | */ | |
514 | static struct nft_pipapo_elem *pipapo_get(const struct net *net, | |
515 | const struct nft_set *set, | |
516 | const u8 *data, u8 genmask) | |
517 | { | |
518 | struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); | |
519 | struct nft_pipapo *priv = nft_set_priv(set); | |
520 | struct nft_pipapo_match *m = priv->clone; | |
521 | unsigned long *res_map, *fill_map = NULL; | |
522 | struct nft_pipapo_field *f; | |
523 | int i; | |
524 | ||
525 | res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); | |
526 | if (!res_map) { | |
527 | ret = ERR_PTR(-ENOMEM); | |
528 | goto out; | |
529 | } | |
530 | ||
531 | fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); | |
532 | if (!fill_map) { | |
533 | ret = ERR_PTR(-ENOMEM); | |
534 | goto out; | |
535 | } | |
536 | ||
537 | memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); | |
538 | ||
539 | nft_pipapo_for_each_field(f, i, m) { | |
540 | bool last = i == m->field_count - 1; | |
e807b13c | 541 | int b; |
3c4287f6 | 542 | |
e807b13c | 543 | /* For each bit group: select lookup table bucket depending on |
3c4287f6 SB |
544 | * packet bytes value, then AND bucket value |
545 | */ | |
4051f431 SB |
546 | if (f->bb == 8) |
547 | pipapo_and_field_buckets_8bit(f, res_map, data); | |
548 | else if (f->bb == 4) | |
e807b13c SB |
549 | pipapo_and_field_buckets_4bit(f, res_map, data); |
550 | else | |
551 | BUG(); | |
3c4287f6 | 552 | |
e807b13c | 553 | data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); |
3c4287f6 SB |
554 | |
555 | /* Now populate the bitmap for the next field, unless this is | |
556 | * the last field, in which case return the matched 'ext' | |
557 | * pointer if any. | |
558 | * | |
559 | * Now res_map contains the matching bitmap, and fill_map is the | |
560 | * bitmap for the next field. | |
561 | */ | |
562 | next_match: | |
563 | b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, | |
564 | last); | |
565 | if (b < 0) | |
566 | goto out; | |
567 | ||
568 | if (last) { | |
569 | if (nft_set_elem_expired(&f->mt[b].e->ext) || | |
570 | (genmask && | |
571 | !nft_set_elem_active(&f->mt[b].e->ext, genmask))) | |
572 | goto next_match; | |
573 | ||
574 | ret = f->mt[b].e; | |
575 | goto out; | |
576 | } | |
577 | ||
e807b13c | 578 | data += NFT_PIPAPO_GROUPS_PADDING(f); |
3c4287f6 SB |
579 | |
580 | /* Swap bitmap indices: fill_map will be the initial bitmap for | |
581 | * the next field (i.e. the new res_map), and res_map is | |
582 | * guaranteed to be all-zeroes at this point, ready to be filled | |
583 | * according to the next mapping table. | |
584 | */ | |
585 | swap(res_map, fill_map); | |
586 | } | |
587 | ||
588 | out: | |
589 | kfree(fill_map); | |
590 | kfree(res_map); | |
591 | return ret; | |
592 | } | |
593 | ||
594 | /** | |
595 | * nft_pipapo_get() - Get matching element reference given key data | |
596 | * @net: Network namespace | |
597 | * @set: nftables API set representation | |
598 | * @elem: nftables API element representation containing key data | |
599 | * @flags: Unused | |
600 | */ | |
eb9d7af3 CW |
601 | static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, |
602 | const struct nft_set_elem *elem, unsigned int flags) | |
3c4287f6 SB |
603 | { |
604 | return pipapo_get(net, set, (const u8 *)elem->key.val.data, | |
605 | nft_genmask_cur(net)); | |
606 | } | |
607 | ||
608 | /** | |
609 | * pipapo_resize() - Resize lookup or mapping table, or both | |
610 | * @f: Field containing lookup and mapping tables | |
611 | * @old_rules: Previous amount of rules in field | |
612 | * @rules: New amount of rules | |
613 | * | |
614 | * Increase, decrease or maintain tables size depending on new amount of rules, | |
615 | * and copy data over. In case the new size is smaller, throw away data for | |
616 | * highest-numbered rules. | |
617 | * | |
618 | * Return: 0 on success, -ENOMEM on allocation failure. | |
619 | */ | |
620 | static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) | |
621 | { | |
622 | long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; | |
623 | union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt; | |
624 | size_t new_bucket_size, copy; | |
625 | int group, bucket; | |
626 | ||
627 | new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); | |
bf3e5839 SB |
628 | #ifdef NFT_PIPAPO_ALIGN |
629 | new_bucket_size = roundup(new_bucket_size, | |
630 | NFT_PIPAPO_ALIGN / sizeof(*new_lt)); | |
631 | #endif | |
3c4287f6 SB |
632 | |
633 | if (new_bucket_size == f->bsize) | |
634 | goto mt; | |
635 | ||
636 | if (new_bucket_size > f->bsize) | |
637 | copy = f->bsize; | |
638 | else | |
639 | copy = new_bucket_size; | |
640 | ||
e807b13c | 641 | new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) * |
bf3e5839 SB |
642 | new_bucket_size * sizeof(*new_lt) + |
643 | NFT_PIPAPO_ALIGN_HEADROOM, | |
644 | GFP_KERNEL); | |
3c4287f6 SB |
645 | if (!new_lt) |
646 | return -ENOMEM; | |
647 | ||
bf3e5839 SB |
648 | new_p = NFT_PIPAPO_LT_ALIGN(new_lt); |
649 | old_p = NFT_PIPAPO_LT_ALIGN(old_lt); | |
650 | ||
3c4287f6 | 651 | for (group = 0; group < f->groups; group++) { |
e807b13c | 652 | for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS(f->bb); bucket++) { |
3c4287f6 SB |
653 | memcpy(new_p, old_p, copy * sizeof(*new_p)); |
654 | new_p += copy; | |
655 | old_p += copy; | |
656 | ||
657 | if (new_bucket_size > f->bsize) | |
658 | new_p += new_bucket_size - f->bsize; | |
659 | else | |
660 | old_p += f->bsize - new_bucket_size; | |
661 | } | |
662 | } | |
663 | ||
664 | mt: | |
665 | new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL); | |
666 | if (!new_mt) { | |
667 | kvfree(new_lt); | |
668 | return -ENOMEM; | |
669 | } | |
670 | ||
671 | memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt)); | |
672 | if (rules > old_rules) { | |
673 | memset(new_mt + old_rules, 0, | |
674 | (rules - old_rules) * sizeof(*new_mt)); | |
675 | } | |
676 | ||
677 | if (new_lt) { | |
678 | f->bsize = new_bucket_size; | |
bf3e5839 | 679 | NFT_PIPAPO_LT_ASSIGN(f, new_lt); |
3c4287f6 SB |
680 | kvfree(old_lt); |
681 | } | |
682 | ||
683 | f->mt = new_mt; | |
684 | kvfree(old_mt); | |
685 | ||
686 | return 0; | |
687 | } | |
688 | ||
689 | /** | |
690 | * pipapo_bucket_set() - Set rule bit in bucket given group and group value | |
691 | * @f: Field containing lookup table | |
692 | * @rule: Rule index | |
693 | * @group: Group index | |
694 | * @v: Value of bit group | |
695 | */ | |
696 | static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, | |
697 | int v) | |
698 | { | |
699 | unsigned long *pos; | |
700 | ||
bf3e5839 SB |
701 | pos = NFT_PIPAPO_LT_ALIGN(f->lt); |
702 | pos += f->bsize * NFT_PIPAPO_BUCKETS(f->bb) * group; | |
3c4287f6 SB |
703 | pos += f->bsize * v; |
704 | ||
705 | __set_bit(rule, pos); | |
706 | } | |
707 | ||
4051f431 SB |
708 | /** |
709 | * pipapo_lt_4b_to_8b() - Switch lookup table group width from 4 bits to 8 bits | |
710 | * @old_groups: Number of current groups | |
711 | * @bsize: Size of one bucket, in longs | |
712 | * @old_lt: Pointer to the current lookup table | |
713 | * @new_lt: Pointer to the new, pre-allocated lookup table | |
714 | * | |
715 | * Each bucket with index b in the new lookup table, belonging to group g, is | |
716 | * filled with the bit intersection between: | |
717 | * - bucket with index given by the upper 4 bits of b, from group g, and | |
718 | * - bucket with index given by the lower 4 bits of b, from group g + 1 | |
719 | * | |
720 | * That is, given buckets from the new lookup table N(x, y) and the old lookup | |
721 | * table O(x, y), with x bucket index, and y group index: | |
722 | * | |
723 | * N(b, g) := O(b / 16, g) & O(b % 16, g + 1) | |
724 | * | |
725 | * This ensures equivalence of the matching results on lookup. Two examples in | |
726 | * pictures: | |
727 | * | |
728 | * bucket | |
729 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ... 254 255 | |
730 | * 0 ^ | |
731 | * 1 | ^ | |
732 | * ... ( & ) | | |
733 | * / \ | | |
734 | * / \ .-( & )-. | |
735 | * / bucket \ | | | |
736 | * group 0 / 1 2 3 \ 4 5 6 7 8 9 10 11 12 13 |14 15 | | |
737 | * 0 / \ | | | |
738 | * 1 \ | | | |
739 | * 2 | --' | |
740 | * 3 '- | |
741 | * ... | |
742 | */ | |
743 | static void pipapo_lt_4b_to_8b(int old_groups, int bsize, | |
744 | unsigned long *old_lt, unsigned long *new_lt) | |
745 | { | |
746 | int g, b, i; | |
747 | ||
748 | for (g = 0; g < old_groups / 2; g++) { | |
749 | int src_g0 = g * 2, src_g1 = g * 2 + 1; | |
750 | ||
751 | for (b = 0; b < NFT_PIPAPO_BUCKETS(8); b++) { | |
752 | int src_b0 = b / NFT_PIPAPO_BUCKETS(4); | |
753 | int src_b1 = b % NFT_PIPAPO_BUCKETS(4); | |
754 | int src_i0 = src_g0 * NFT_PIPAPO_BUCKETS(4) + src_b0; | |
755 | int src_i1 = src_g1 * NFT_PIPAPO_BUCKETS(4) + src_b1; | |
756 | ||
757 | for (i = 0; i < bsize; i++) { | |
758 | *new_lt = old_lt[src_i0 * bsize + i] & | |
759 | old_lt[src_i1 * bsize + i]; | |
760 | new_lt++; | |
761 | } | |
762 | } | |
763 | } | |
764 | } | |
765 | ||
766 | /** | |
767 | * pipapo_lt_8b_to_4b() - Switch lookup table group width from 8 bits to 4 bits | |
768 | * @old_groups: Number of current groups | |
769 | * @bsize: Size of one bucket, in longs | |
770 | * @old_lt: Pointer to the current lookup table | |
771 | * @new_lt: Pointer to the new, pre-allocated lookup table | |
772 | * | |
773 | * Each bucket with index b in the new lookup table, belonging to group g, is | |
774 | * filled with the bit union of: | |
775 | * - all the buckets with index such that the upper four bits of the lower byte | |
776 | * equal b, from group g, with g odd | |
777 | * - all the buckets with index such that the lower four bits equal b, from | |
778 | * group g, with g even | |
779 | * | |
780 | * That is, given buckets from the new lookup table N(x, y) and the old lookup | |
781 | * table O(x, y), with x bucket index, and y group index: | |
782 | * | |
783 | * - with g odd: N(b, g) := U(O(x, g) for each x : x = (b & 0xf0) >> 4) | |
784 | * - with g even: N(b, g) := U(O(x, g) for each x : x = b & 0x0f) | |
785 | * | |
786 | * where U() denotes the arbitrary union operation (binary OR of n terms). This | |
787 | * ensures equivalence of the matching results on lookup. | |
788 | */ | |
789 | static void pipapo_lt_8b_to_4b(int old_groups, int bsize, | |
790 | unsigned long *old_lt, unsigned long *new_lt) | |
791 | { | |
792 | int g, b, bsrc, i; | |
793 | ||
794 | memset(new_lt, 0, old_groups * 2 * NFT_PIPAPO_BUCKETS(4) * bsize * | |
795 | sizeof(unsigned long)); | |
796 | ||
797 | for (g = 0; g < old_groups * 2; g += 2) { | |
798 | int src_g = g / 2; | |
799 | ||
800 | for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) { | |
801 | for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g; | |
802 | bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1); | |
803 | bsrc++) { | |
804 | if (((bsrc & 0xf0) >> 4) != b) | |
805 | continue; | |
806 | ||
807 | for (i = 0; i < bsize; i++) | |
808 | new_lt[i] |= old_lt[bsrc * bsize + i]; | |
809 | } | |
810 | ||
811 | new_lt += bsize; | |
812 | } | |
813 | ||
814 | for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) { | |
815 | for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g; | |
816 | bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1); | |
817 | bsrc++) { | |
818 | if ((bsrc & 0x0f) != b) | |
819 | continue; | |
820 | ||
821 | for (i = 0; i < bsize; i++) | |
822 | new_lt[i] |= old_lt[bsrc * bsize + i]; | |
823 | } | |
824 | ||
825 | new_lt += bsize; | |
826 | } | |
827 | } | |
828 | } | |
829 | ||
830 | /** | |
831 | * pipapo_lt_bits_adjust() - Adjust group size for lookup table if needed | |
832 | * @f: Field containing lookup table | |
833 | */ | |
834 | static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) | |
835 | { | |
836 | unsigned long *new_lt; | |
837 | int groups, bb; | |
838 | size_t lt_size; | |
839 | ||
840 | lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize * | |
841 | sizeof(*f->lt); | |
842 | ||
843 | if (f->bb == NFT_PIPAPO_GROUP_BITS_SMALL_SET && | |
844 | lt_size > NFT_PIPAPO_LT_SIZE_HIGH) { | |
845 | groups = f->groups * 2; | |
846 | bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET; | |
847 | ||
848 | lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * | |
849 | sizeof(*f->lt); | |
850 | } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET && | |
851 | lt_size < NFT_PIPAPO_LT_SIZE_LOW) { | |
852 | groups = f->groups / 2; | |
853 | bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET; | |
854 | ||
855 | lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * | |
856 | sizeof(*f->lt); | |
857 | ||
858 | /* Don't increase group width if the resulting lookup table size | |
859 | * would exceed the upper size threshold for a "small" set. | |
860 | */ | |
861 | if (lt_size > NFT_PIPAPO_LT_SIZE_HIGH) | |
862 | return; | |
863 | } else { | |
864 | return; | |
865 | } | |
866 | ||
bf3e5839 | 867 | new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL); |
4051f431 SB |
868 | if (!new_lt) |
869 | return; | |
870 | ||
871 | NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; | |
bf3e5839 SB |
872 | if (f->bb == 4 && bb == 8) { |
873 | pipapo_lt_4b_to_8b(f->groups, f->bsize, | |
874 | NFT_PIPAPO_LT_ALIGN(f->lt), | |
875 | NFT_PIPAPO_LT_ALIGN(new_lt)); | |
876 | } else if (f->bb == 8 && bb == 4) { | |
877 | pipapo_lt_8b_to_4b(f->groups, f->bsize, | |
878 | NFT_PIPAPO_LT_ALIGN(f->lt), | |
879 | NFT_PIPAPO_LT_ALIGN(new_lt)); | |
880 | } else { | |
4051f431 | 881 | BUG(); |
bf3e5839 | 882 | } |
4051f431 SB |
883 | |
884 | f->groups = groups; | |
885 | f->bb = bb; | |
886 | kvfree(f->lt); | |
bf3e5839 | 887 | NFT_PIPAPO_LT_ASSIGN(f, new_lt); |
4051f431 SB |
888 | } |
889 | ||
3c4287f6 SB |
890 | /** |
891 | * pipapo_insert() - Insert new rule in field given input key and mask length | |
892 | * @f: Field containing lookup table | |
893 | * @k: Input key for classification, without nftables padding | |
894 | * @mask_bits: Length of mask; matches field length for non-ranged entry | |
895 | * | |
896 | * Insert a new rule reference in lookup buckets corresponding to k and | |
897 | * mask_bits. | |
898 | * | |
899 | * Return: 1 on success (one rule inserted), negative error code on failure. | |
900 | */ | |
901 | static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, | |
902 | int mask_bits) | |
903 | { | |
e807b13c | 904 | int rule = f->rules++, group, ret, bit_offset = 0; |
3c4287f6 SB |
905 | |
906 | ret = pipapo_resize(f, f->rules - 1, f->rules); | |
907 | if (ret) | |
908 | return ret; | |
909 | ||
910 | for (group = 0; group < f->groups; group++) { | |
911 | int i, v; | |
912 | u8 mask; | |
913 | ||
e807b13c SB |
914 | v = k[group / (BITS_PER_BYTE / f->bb)]; |
915 | v &= GENMASK(BITS_PER_BYTE - bit_offset - 1, 0); | |
916 | v >>= (BITS_PER_BYTE - bit_offset) - f->bb; | |
3c4287f6 | 917 | |
e807b13c SB |
918 | bit_offset += f->bb; |
919 | bit_offset %= BITS_PER_BYTE; | |
920 | ||
921 | if (mask_bits >= (group + 1) * f->bb) { | |
3c4287f6 SB |
922 | /* Not masked */ |
923 | pipapo_bucket_set(f, rule, group, v); | |
e807b13c | 924 | } else if (mask_bits <= group * f->bb) { |
3c4287f6 | 925 | /* Completely masked */ |
e807b13c | 926 | for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) |
3c4287f6 SB |
927 | pipapo_bucket_set(f, rule, group, i); |
928 | } else { | |
929 | /* The mask limit falls on this group */ | |
e807b13c SB |
930 | mask = GENMASK(f->bb - 1, 0); |
931 | mask >>= mask_bits - group * f->bb; | |
932 | for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) { | |
3c4287f6 SB |
933 | if ((i & ~mask) == (v & ~mask)) |
934 | pipapo_bucket_set(f, rule, group, i); | |
935 | } | |
936 | } | |
937 | } | |
938 | ||
4051f431 SB |
939 | pipapo_lt_bits_adjust(f); |
940 | ||
3c4287f6 SB |
941 | return 1; |
942 | } | |
943 | ||
944 | /** | |
945 | * pipapo_step_diff() - Check if setting @step bit in netmask would change it | |
946 | * @base: Mask we are expanding | |
947 | * @step: Step bit for given expansion step | |
948 | * @len: Total length of mask space (set and unset bits), bytes | |
949 | * | |
950 | * Convenience function for mask expansion. | |
951 | * | |
952 | * Return: true if step bit changes mask (i.e. isn't set), false otherwise. | |
953 | */ | |
954 | static bool pipapo_step_diff(u8 *base, int step, int len) | |
955 | { | |
956 | /* Network order, byte-addressed */ | |
957 | #ifdef __BIG_ENDIAN__ | |
958 | return !(BIT(step % BITS_PER_BYTE) & base[step / BITS_PER_BYTE]); | |
959 | #else | |
960 | return !(BIT(step % BITS_PER_BYTE) & | |
961 | base[len - 1 - step / BITS_PER_BYTE]); | |
962 | #endif | |
963 | } | |
964 | ||
965 | /** | |
966 | * pipapo_step_after_end() - Check if mask exceeds range end with given step | |
967 | * @base: Mask we are expanding | |
968 | * @end: End of range | |
969 | * @step: Step bit for given expansion step, highest bit to be set | |
970 | * @len: Total length of mask space (set and unset bits), bytes | |
971 | * | |
972 | * Convenience function for mask expansion. | |
973 | * | |
974 | * Return: true if mask exceeds range setting step bits, false otherwise. | |
975 | */ | |
976 | static bool pipapo_step_after_end(const u8 *base, const u8 *end, int step, | |
977 | int len) | |
978 | { | |
979 | u8 tmp[NFT_PIPAPO_MAX_BYTES]; | |
980 | int i; | |
981 | ||
982 | memcpy(tmp, base, len); | |
983 | ||
984 | /* Network order, byte-addressed */ | |
985 | for (i = 0; i <= step; i++) | |
986 | #ifdef __BIG_ENDIAN__ | |
987 | tmp[i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); | |
988 | #else | |
989 | tmp[len - 1 - i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); | |
990 | #endif | |
991 | ||
992 | return memcmp(tmp, end, len) > 0; | |
993 | } | |
994 | ||
995 | /** | |
996 | * pipapo_base_sum() - Sum step bit to given len-sized netmask base with carry | |
997 | * @base: Netmask base | |
998 | * @step: Step bit to sum | |
999 | * @len: Netmask length, bytes | |
1000 | */ | |
1001 | static void pipapo_base_sum(u8 *base, int step, int len) | |
1002 | { | |
1003 | bool carry = false; | |
1004 | int i; | |
1005 | ||
1006 | /* Network order, byte-addressed */ | |
1007 | #ifdef __BIG_ENDIAN__ | |
1008 | for (i = step / BITS_PER_BYTE; i < len; i++) { | |
1009 | #else | |
1010 | for (i = len - 1 - step / BITS_PER_BYTE; i >= 0; i--) { | |
1011 | #endif | |
1012 | if (carry) | |
1013 | base[i]++; | |
1014 | else | |
1015 | base[i] += 1 << (step % BITS_PER_BYTE); | |
1016 | ||
1017 | if (base[i]) | |
1018 | break; | |
1019 | ||
1020 | carry = true; | |
1021 | } | |
1022 | } | |
1023 | ||
1024 | /** | |
1025 | * pipapo_expand() - Expand to composing netmasks, insert into lookup table | |
1026 | * @f: Field containing lookup table | |
1027 | * @start: Start of range | |
1028 | * @end: End of range | |
1029 | * @len: Length of value in bits | |
1030 | * | |
1031 | * Expand range to composing netmasks and insert corresponding rule references | |
1032 | * in lookup buckets. | |
1033 | * | |
1034 | * Return: number of inserted rules on success, negative error code on failure. | |
1035 | */ | |
1036 | static int pipapo_expand(struct nft_pipapo_field *f, | |
1037 | const u8 *start, const u8 *end, int len) | |
1038 | { | |
1039 | int step, masks = 0, bytes = DIV_ROUND_UP(len, BITS_PER_BYTE); | |
1040 | u8 base[NFT_PIPAPO_MAX_BYTES]; | |
1041 | ||
1042 | memcpy(base, start, bytes); | |
1043 | while (memcmp(base, end, bytes) <= 0) { | |
1044 | int err; | |
1045 | ||
1046 | step = 0; | |
1047 | while (pipapo_step_diff(base, step, bytes)) { | |
1048 | if (pipapo_step_after_end(base, end, step, bytes)) | |
1049 | break; | |
1050 | ||
1051 | step++; | |
1052 | if (step >= len) { | |
1053 | if (!masks) { | |
1054 | pipapo_insert(f, base, 0); | |
1055 | masks = 1; | |
1056 | } | |
1057 | goto out; | |
1058 | } | |
1059 | } | |
1060 | ||
1061 | err = pipapo_insert(f, base, len - step); | |
1062 | ||
1063 | if (err < 0) | |
1064 | return err; | |
1065 | ||
1066 | masks++; | |
1067 | pipapo_base_sum(base, step, bytes); | |
1068 | } | |
1069 | out: | |
1070 | return masks; | |
1071 | } | |
1072 | ||
1073 | /** | |
1074 | * pipapo_map() - Insert rules in mapping tables, mapping them between fields | |
1075 | * @m: Matching data, including mapping table | |
1076 | * @map: Table of rule maps: array of first rule and amount of rules | |
1077 | * in next field a given rule maps to, for each field | |
3db86c39 | 1078 | * @e: For last field, nft_set_ext pointer matching rules map to |
3c4287f6 SB |
1079 | */ |
1080 | static void pipapo_map(struct nft_pipapo_match *m, | |
1081 | union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS], | |
1082 | struct nft_pipapo_elem *e) | |
1083 | { | |
1084 | struct nft_pipapo_field *f; | |
1085 | int i, j; | |
1086 | ||
1087 | for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) { | |
1088 | for (j = 0; j < map[i].n; j++) { | |
1089 | f->mt[map[i].to + j].to = map[i + 1].to; | |
1090 | f->mt[map[i].to + j].n = map[i + 1].n; | |
1091 | } | |
1092 | } | |
1093 | ||
1094 | /* Last field: map to ext instead of mapping to next field */ | |
1095 | for (j = 0; j < map[i].n; j++) | |
1096 | f->mt[map[i].to + j].e = e; | |
1097 | } | |
1098 | ||
1099 | /** | |
1100 | * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results | |
1101 | * @clone: Copy of matching data with pending insertions and deletions | |
3db86c39 | 1102 | * @bsize_max: Maximum bucket size, scratch maps cover two buckets |
3c4287f6 SB |
1103 | * |
1104 | * Return: 0 on success, -ENOMEM on failure. | |
1105 | */ | |
1106 | static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, | |
1107 | unsigned long bsize_max) | |
1108 | { | |
1109 | int i; | |
1110 | ||
1111 | for_each_possible_cpu(i) { | |
1112 | unsigned long *scratch; | |
bf3e5839 SB |
1113 | #ifdef NFT_PIPAPO_ALIGN |
1114 | unsigned long *scratch_aligned; | |
1115 | #endif | |
3c4287f6 | 1116 | |
bf3e5839 SB |
1117 | scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2 + |
1118 | NFT_PIPAPO_ALIGN_HEADROOM, | |
3c4287f6 SB |
1119 | GFP_KERNEL, cpu_to_node(i)); |
1120 | if (!scratch) { | |
1121 | /* On failure, there's no need to undo previous | |
1122 | * allocations: this means that some scratch maps have | |
1123 | * a bigger allocated size now (this is only called on | |
1124 | * insertion), but the extra space won't be used by any | |
1125 | * CPU as new elements are not inserted and m->bsize_max | |
1126 | * is not updated. | |
1127 | */ | |
1128 | return -ENOMEM; | |
1129 | } | |
1130 | ||
1131 | kfree(*per_cpu_ptr(clone->scratch, i)); | |
1132 | ||
1133 | *per_cpu_ptr(clone->scratch, i) = scratch; | |
bf3e5839 SB |
1134 | |
1135 | #ifdef NFT_PIPAPO_ALIGN | |
1136 | scratch_aligned = NFT_PIPAPO_LT_ALIGN(scratch); | |
1137 | *per_cpu_ptr(clone->scratch_aligned, i) = scratch_aligned; | |
1138 | #endif | |
3c4287f6 SB |
1139 | } |
1140 | ||
1141 | return 0; | |
1142 | } | |
1143 | ||
1144 | /** | |
1145 | * nft_pipapo_insert() - Validate and insert ranged elements | |
1146 | * @net: Network namespace | |
1147 | * @set: nftables API set representation | |
1148 | * @elem: nftables API element representation containing key data | |
1149 | * @ext2: Filled with pointer to &struct nft_set_ext in inserted element | |
1150 | * | |
1151 | * Return: 0 on success, error pointer on failure. | |
1152 | */ | |
1153 | static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, | |
1154 | const struct nft_set_elem *elem, | |
1155 | struct nft_set_ext **ext2) | |
1156 | { | |
1157 | const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); | |
1158 | union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; | |
1159 | const u8 *start = (const u8 *)elem->key.val.data, *end; | |
1160 | struct nft_pipapo_elem *e = elem->priv, *dup; | |
1161 | struct nft_pipapo *priv = nft_set_priv(set); | |
1162 | struct nft_pipapo_match *m = priv->clone; | |
1163 | u8 genmask = nft_genmask_next(net); | |
1164 | struct nft_pipapo_field *f; | |
1165 | int i, bsize_max, err = 0; | |
1166 | ||
0eb4b5ee SB |
1167 | if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) |
1168 | end = (const u8 *)nft_set_ext_key_end(ext)->data; | |
1169 | else | |
1170 | end = start; | |
1171 | ||
3c4287f6 | 1172 | dup = pipapo_get(net, set, start, genmask); |
0eb4b5ee SB |
1173 | if (!IS_ERR(dup)) { |
1174 | /* Check if we already have the same exact entry */ | |
1175 | const struct nft_data *dup_key, *dup_end; | |
1176 | ||
1177 | dup_key = nft_set_ext_key(&dup->ext); | |
1178 | if (nft_set_ext_exists(&dup->ext, NFT_SET_EXT_KEY_END)) | |
1179 | dup_end = nft_set_ext_key_end(&dup->ext); | |
1180 | else | |
1181 | dup_end = dup_key; | |
1182 | ||
1183 | if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) && | |
1184 | !memcmp(end, dup_end->data, sizeof(*dup_end->data))) { | |
1185 | *ext2 = &dup->ext; | |
1186 | return -EEXIST; | |
3c4287f6 | 1187 | } |
0eb4b5ee SB |
1188 | |
1189 | return -ENOTEMPTY; | |
1190 | } | |
1191 | ||
1192 | if (PTR_ERR(dup) == -ENOENT) { | |
1193 | /* Look for partially overlapping entries */ | |
1194 | dup = pipapo_get(net, set, end, nft_genmask_next(net)); | |
3c4287f6 SB |
1195 | } |
1196 | ||
1197 | if (PTR_ERR(dup) != -ENOENT) { | |
1198 | if (IS_ERR(dup)) | |
1199 | return PTR_ERR(dup); | |
1200 | *ext2 = &dup->ext; | |
0eb4b5ee | 1201 | return -ENOTEMPTY; |
3c4287f6 SB |
1202 | } |
1203 | ||
1204 | /* Validate */ | |
1205 | nft_pipapo_for_each_field(f, i, m) { | |
1206 | const u8 *start_p = start, *end_p = end; | |
1207 | ||
1208 | if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX) | |
1209 | return -ENOSPC; | |
1210 | ||
1211 | if (memcmp(start_p, end_p, | |
e807b13c | 1212 | f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) > 0) |
3c4287f6 SB |
1213 | return -EINVAL; |
1214 | ||
e807b13c SB |
1215 | start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); |
1216 | end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); | |
3c4287f6 SB |
1217 | } |
1218 | ||
1219 | /* Insert */ | |
1220 | priv->dirty = true; | |
1221 | ||
1222 | bsize_max = m->bsize_max; | |
1223 | ||
1224 | nft_pipapo_for_each_field(f, i, m) { | |
1225 | int ret; | |
1226 | ||
1227 | rulemap[i].to = f->rules; | |
1228 | ||
1229 | ret = memcmp(start, end, | |
e807b13c SB |
1230 | f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)); |
1231 | if (!ret) | |
1232 | ret = pipapo_insert(f, start, f->groups * f->bb); | |
1233 | else | |
1234 | ret = pipapo_expand(f, start, end, f->groups * f->bb); | |
3c4287f6 SB |
1235 | |
1236 | if (f->bsize > bsize_max) | |
1237 | bsize_max = f->bsize; | |
1238 | ||
1239 | rulemap[i].n = ret; | |
1240 | ||
e807b13c SB |
1241 | start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); |
1242 | end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); | |
3c4287f6 SB |
1243 | } |
1244 | ||
c3829285 SB |
1245 | if (!*get_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { |
1246 | put_cpu_ptr(m->scratch); | |
1247 | ||
3c4287f6 SB |
1248 | err = pipapo_realloc_scratch(m, bsize_max); |
1249 | if (err) | |
1250 | return err; | |
1251 | ||
3c4287f6 | 1252 | m->bsize_max = bsize_max; |
c3829285 SB |
1253 | } else { |
1254 | put_cpu_ptr(m->scratch); | |
3c4287f6 SB |
1255 | } |
1256 | ||
1257 | *ext2 = &e->ext; | |
1258 | ||
1259 | pipapo_map(m, rulemap, e); | |
1260 | ||
1261 | return 0; | |
1262 | } | |
1263 | ||
1264 | /** | |
1265 | * pipapo_clone() - Clone matching data to create new working copy | |
1266 | * @old: Existing matching data | |
1267 | * | |
1268 | * Return: copy of matching data passed as 'old', error pointer on failure | |
1269 | */ | |
1270 | static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) | |
1271 | { | |
1272 | struct nft_pipapo_field *dst, *src; | |
1273 | struct nft_pipapo_match *new; | |
1274 | int i; | |
1275 | ||
1276 | new = kmalloc(sizeof(*new) + sizeof(*dst) * old->field_count, | |
1277 | GFP_KERNEL); | |
1278 | if (!new) | |
1279 | return ERR_PTR(-ENOMEM); | |
1280 | ||
1281 | new->field_count = old->field_count; | |
1282 | new->bsize_max = old->bsize_max; | |
1283 | ||
1284 | new->scratch = alloc_percpu(*new->scratch); | |
1285 | if (!new->scratch) | |
1286 | goto out_scratch; | |
1287 | ||
bf3e5839 SB |
1288 | #ifdef NFT_PIPAPO_ALIGN |
1289 | new->scratch_aligned = alloc_percpu(*new->scratch_aligned); | |
1290 | if (!new->scratch_aligned) | |
1291 | goto out_scratch; | |
1292 | #endif | |
23c54263 FW |
1293 | for_each_possible_cpu(i) |
1294 | *per_cpu_ptr(new->scratch, i) = NULL; | |
1295 | ||
1296 | if (pipapo_realloc_scratch(new, old->bsize_max)) | |
1297 | goto out_scratch_realloc; | |
bf3e5839 | 1298 | |
3c4287f6 SB |
1299 | rcu_head_init(&new->rcu); |
1300 | ||
1301 | src = old->f; | |
1302 | dst = new->f; | |
1303 | ||
1304 | for (i = 0; i < old->field_count; i++) { | |
bf3e5839 SB |
1305 | unsigned long *new_lt; |
1306 | ||
3c4287f6 SB |
1307 | memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); |
1308 | ||
bf3e5839 SB |
1309 | new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) * |
1310 | src->bsize * sizeof(*dst->lt) + | |
1311 | NFT_PIPAPO_ALIGN_HEADROOM, | |
1312 | GFP_KERNEL); | |
1313 | if (!new_lt) | |
3c4287f6 SB |
1314 | goto out_lt; |
1315 | ||
bf3e5839 SB |
1316 | NFT_PIPAPO_LT_ASSIGN(dst, new_lt); |
1317 | ||
1318 | memcpy(NFT_PIPAPO_LT_ALIGN(new_lt), | |
1319 | NFT_PIPAPO_LT_ALIGN(src->lt), | |
3c4287f6 | 1320 | src->bsize * sizeof(*dst->lt) * |
e807b13c | 1321 | src->groups * NFT_PIPAPO_BUCKETS(src->bb)); |
3c4287f6 SB |
1322 | |
1323 | dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL); | |
1324 | if (!dst->mt) | |
1325 | goto out_mt; | |
1326 | ||
1327 | memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); | |
1328 | src++; | |
1329 | dst++; | |
1330 | } | |
1331 | ||
1332 | return new; | |
1333 | ||
1334 | out_mt: | |
1335 | kvfree(dst->lt); | |
1336 | out_lt: | |
1337 | for (dst--; i > 0; i--) { | |
1338 | kvfree(dst->mt); | |
1339 | kvfree(dst->lt); | |
1340 | dst--; | |
1341 | } | |
23c54263 FW |
1342 | out_scratch_realloc: |
1343 | for_each_possible_cpu(i) | |
1344 | kfree(*per_cpu_ptr(new->scratch, i)); | |
bf3e5839 SB |
1345 | #ifdef NFT_PIPAPO_ALIGN |
1346 | free_percpu(new->scratch_aligned); | |
1347 | #endif | |
3c4287f6 | 1348 | out_scratch: |
bf3e5839 | 1349 | free_percpu(new->scratch); |
3c4287f6 SB |
1350 | kfree(new); |
1351 | ||
1352 | return ERR_PTR(-ENOMEM); | |
1353 | } | |
1354 | ||
1355 | /** | |
1356 | * pipapo_rules_same_key() - Get number of rules originated from the same entry | |
1357 | * @f: Field containing mapping table | |
1358 | * @first: Index of first rule in set of rules mapping to same entry | |
1359 | * | |
1360 | * Using the fact that all rules in a field that originated from the same entry | |
1361 | * will map to the same set of rules in the next field, or to the same element | |
1362 | * reference, return the cardinality of the set of rules that originated from | |
1363 | * the same entry as the rule with index @first, @first rule included. | |
1364 | * | |
1365 | * In pictures: | |
1366 | * rules | |
1367 | * field #0 0 1 2 3 4 | |
1368 | * map to: 0 1 2-4 2-4 5-9 | |
1369 | * . . ....... . ... | |
1370 | * | | | | \ \ | |
1371 | * | | | | \ \ | |
1372 | * | | | | \ \ | |
1373 | * ' ' ' ' ' \ | |
1374 | * in field #1 0 1 2 3 4 5 ... | |
1375 | * | |
1376 | * if this is called for rule 2 on field #0, it will return 3, as also rules 2 | |
1377 | * and 3 in field 0 map to the same set of rules (2, 3, 4) in the next field. | |
1378 | * | |
1379 | * For the last field in a set, we can rely on associated entries to map to the | |
1380 | * same element references. | |
1381 | * | |
1382 | * Return: Number of rules that originated from the same entry as @first. | |
1383 | */ | |
1384 | static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first) | |
1385 | { | |
1386 | struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */ | |
1387 | int r; | |
1388 | ||
1389 | for (r = first; r < f->rules; r++) { | |
1390 | if (r != first && e != f->mt[r].e) | |
1391 | return r - first; | |
1392 | ||
1393 | e = f->mt[r].e; | |
1394 | } | |
1395 | ||
1396 | if (r != first) | |
1397 | return r - first; | |
1398 | ||
1399 | return 0; | |
1400 | } | |
1401 | ||
1402 | /** | |
1403 | * pipapo_unmap() - Remove rules from mapping tables, renumber remaining ones | |
1404 | * @mt: Mapping array | |
1405 | * @rules: Original amount of rules in mapping table | |
1406 | * @start: First rule index to be removed | |
1407 | * @n: Amount of rules to be removed | |
1408 | * @to_offset: First rule index, in next field, this group of rules maps to | |
1409 | * @is_last: If this is the last field, delete reference from mapping array | |
1410 | * | |
1411 | * This is used to unmap rules from the mapping table for a single field, | |
1412 | * maintaining consistency and compactness for the existing ones. | |
1413 | * | |
1414 | * In pictures: let's assume that we want to delete rules 2 and 3 from the | |
1415 | * following mapping array: | |
1416 | * | |
1417 | * rules | |
1418 | * 0 1 2 3 4 | |
1419 | * map to: 4-10 4-10 11-15 11-15 16-18 | |
1420 | * | |
1421 | * the result will be: | |
1422 | * | |
1423 | * rules | |
1424 | * 0 1 2 | |
1425 | * map to: 4-10 4-10 11-13 | |
1426 | * | |
1427 | * for fields before the last one. In case this is the mapping table for the | |
1428 | * last field in a set, and rules map to pointers to &struct nft_pipapo_elem: | |
1429 | * | |
1430 | * rules | |
1431 | * 0 1 2 3 4 | |
1432 | * element pointers: 0x42 0x42 0x33 0x33 0x44 | |
1433 | * | |
1434 | * the result will be: | |
1435 | * | |
1436 | * rules | |
1437 | * 0 1 2 | |
1438 | * element pointers: 0x42 0x42 0x44 | |
1439 | */ | |
1440 | static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules, | |
1441 | int start, int n, int to_offset, bool is_last) | |
1442 | { | |
1443 | int i; | |
1444 | ||
1445 | memmove(mt + start, mt + start + n, (rules - start - n) * sizeof(*mt)); | |
1446 | memset(mt + rules - n, 0, n * sizeof(*mt)); | |
1447 | ||
1448 | if (is_last) | |
1449 | return; | |
1450 | ||
1451 | for (i = start; i < rules - n; i++) | |
1452 | mt[i].to -= to_offset; | |
1453 | } | |
1454 | ||
1455 | /** | |
1456 | * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map | |
1457 | * @m: Matching data | |
3db86c39 | 1458 | * @rulemap: Table of rule maps, arrays of first rule and amount of rules |
3c4287f6 SB |
1459 | * in next field a given entry maps to, for each field |
1460 | * | |
1461 | * For each rule in lookup table buckets mapping to this set of rules, drop | |
1462 | * all bits set in lookup table mapping. In pictures, assuming we want to drop | |
1463 | * rules 0 and 1 from this lookup table: | |
1464 | * | |
1465 | * bucket | |
1466 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
1467 | * 0 0 1,2 | |
1468 | * 1 1,2 0 | |
1469 | * 2 0 1,2 | |
1470 | * 3 0 1,2 | |
1471 | * 4 0,1,2 | |
1472 | * 5 0 1 2 | |
1473 | * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | |
1474 | * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 | |
1475 | * | |
1476 | * rule 2 becomes rule 0, and the result will be: | |
1477 | * | |
1478 | * bucket | |
1479 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
1480 | * 0 0 | |
1481 | * 1 0 | |
1482 | * 2 0 | |
1483 | * 3 0 | |
1484 | * 4 0 | |
1485 | * 5 0 | |
1486 | * 6 0 | |
1487 | * 7 0 0 | |
1488 | * | |
1489 | * once this is done, call unmap() to drop all the corresponding rule references | |
1490 | * from mapping tables. | |
1491 | */ | |
1492 | static void pipapo_drop(struct nft_pipapo_match *m, | |
1493 | union nft_pipapo_map_bucket rulemap[]) | |
1494 | { | |
1495 | struct nft_pipapo_field *f; | |
1496 | int i; | |
1497 | ||
1498 | nft_pipapo_for_each_field(f, i, m) { | |
1499 | int g; | |
1500 | ||
1501 | for (g = 0; g < f->groups; g++) { | |
1502 | unsigned long *pos; | |
1503 | int b; | |
1504 | ||
bf3e5839 SB |
1505 | pos = NFT_PIPAPO_LT_ALIGN(f->lt) + g * |
1506 | NFT_PIPAPO_BUCKETS(f->bb) * f->bsize; | |
3c4287f6 | 1507 | |
e807b13c | 1508 | for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) { |
3c4287f6 SB |
1509 | bitmap_cut(pos, pos, rulemap[i].to, |
1510 | rulemap[i].n, | |
1511 | f->bsize * BITS_PER_LONG); | |
1512 | ||
1513 | pos += f->bsize; | |
1514 | } | |
1515 | } | |
1516 | ||
1517 | pipapo_unmap(f->mt, f->rules, rulemap[i].to, rulemap[i].n, | |
1518 | rulemap[i + 1].n, i == m->field_count - 1); | |
1519 | if (pipapo_resize(f, f->rules, f->rules - rulemap[i].n)) { | |
1520 | /* We can ignore this, a failure to shrink tables down | |
1521 | * doesn't make tables invalid. | |
1522 | */ | |
1523 | ; | |
1524 | } | |
1525 | f->rules -= rulemap[i].n; | |
4051f431 SB |
1526 | |
1527 | pipapo_lt_bits_adjust(f); | |
3c4287f6 SB |
1528 | } |
1529 | } | |
1530 | ||
1531 | /** | |
1532 | * pipapo_gc() - Drop expired entries from set, destroy start and end elements | |
1533 | * @set: nftables API set representation | |
1534 | * @m: Matching data | |
1535 | */ | |
1536 | static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) | |
1537 | { | |
1538 | struct nft_pipapo *priv = nft_set_priv(set); | |
1539 | int rules_f0, first_rule = 0; | |
aaa31047 | 1540 | struct nft_pipapo_elem *e; |
3c4287f6 SB |
1541 | |
1542 | while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { | |
1543 | union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; | |
1544 | struct nft_pipapo_field *f; | |
3c4287f6 SB |
1545 | int i, start, rules_fx; |
1546 | ||
1547 | start = first_rule; | |
1548 | rules_fx = rules_f0; | |
1549 | ||
1550 | nft_pipapo_for_each_field(f, i, m) { | |
1551 | rulemap[i].to = start; | |
1552 | rulemap[i].n = rules_fx; | |
1553 | ||
1554 | if (i < m->field_count - 1) { | |
1555 | rules_fx = f->mt[start].n; | |
1556 | start = f->mt[start].to; | |
1557 | } | |
1558 | } | |
1559 | ||
1560 | /* Pick the last field, and its last index */ | |
1561 | f--; | |
1562 | i--; | |
1563 | e = f->mt[rulemap[i].to].e; | |
1564 | if (nft_set_elem_expired(&e->ext) && | |
1565 | !nft_set_elem_mark_busy(&e->ext)) { | |
1566 | priv->dirty = true; | |
1567 | pipapo_drop(m, rulemap); | |
1568 | ||
1569 | rcu_barrier(); | |
1570 | nft_set_elem_destroy(set, e, true); | |
1571 | ||
1572 | /* And check again current first rule, which is now the | |
1573 | * first we haven't checked. | |
1574 | */ | |
1575 | } else { | |
1576 | first_rule += rules_f0; | |
1577 | } | |
1578 | } | |
1579 | ||
aaa31047 PNA |
1580 | e = nft_set_catchall_gc(set); |
1581 | if (e) | |
1582 | nft_set_elem_destroy(set, e, true); | |
1583 | ||
3c4287f6 SB |
1584 | priv->last_gc = jiffies; |
1585 | } | |
1586 | ||
1587 | /** | |
1588 | * pipapo_free_fields() - Free per-field tables contained in matching data | |
1589 | * @m: Matching data | |
1590 | */ | |
1591 | static void pipapo_free_fields(struct nft_pipapo_match *m) | |
1592 | { | |
1593 | struct nft_pipapo_field *f; | |
1594 | int i; | |
1595 | ||
1596 | nft_pipapo_for_each_field(f, i, m) { | |
1597 | kvfree(f->lt); | |
1598 | kvfree(f->mt); | |
1599 | } | |
1600 | } | |
1601 | ||
1602 | /** | |
1603 | * pipapo_reclaim_match - RCU callback to free fields from old matching data | |
1604 | * @rcu: RCU head | |
1605 | */ | |
1606 | static void pipapo_reclaim_match(struct rcu_head *rcu) | |
1607 | { | |
1608 | struct nft_pipapo_match *m; | |
1609 | int i; | |
1610 | ||
1611 | m = container_of(rcu, struct nft_pipapo_match, rcu); | |
1612 | ||
1613 | for_each_possible_cpu(i) | |
1614 | kfree(*per_cpu_ptr(m->scratch, i)); | |
1615 | ||
bf3e5839 SB |
1616 | #ifdef NFT_PIPAPO_ALIGN |
1617 | free_percpu(m->scratch_aligned); | |
1618 | #endif | |
3c4287f6 SB |
1619 | free_percpu(m->scratch); |
1620 | ||
1621 | pipapo_free_fields(m); | |
1622 | ||
1623 | kfree(m); | |
1624 | } | |
1625 | ||
1626 | /** | |
1627 | * pipapo_commit() - Replace lookup data with current working copy | |
1628 | * @set: nftables API set representation | |
1629 | * | |
1630 | * While at it, check if we should perform garbage collection on the working | |
1631 | * copy before committing it for lookup, and don't replace the table if the | |
1632 | * working copy doesn't have pending changes. | |
1633 | * | |
1634 | * We also need to create a new working copy for subsequent insertions and | |
1635 | * deletions. | |
1636 | */ | |
1637 | static void pipapo_commit(const struct nft_set *set) | |
1638 | { | |
1639 | struct nft_pipapo *priv = nft_set_priv(set); | |
1640 | struct nft_pipapo_match *new_clone, *old; | |
1641 | ||
1642 | if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) | |
1643 | pipapo_gc(set, priv->clone); | |
1644 | ||
1645 | if (!priv->dirty) | |
1646 | return; | |
1647 | ||
1648 | new_clone = pipapo_clone(priv->clone); | |
1649 | if (IS_ERR(new_clone)) | |
1650 | return; | |
1651 | ||
1652 | priv->dirty = false; | |
1653 | ||
1654 | old = rcu_access_pointer(priv->match); | |
1655 | rcu_assign_pointer(priv->match, priv->clone); | |
1656 | if (old) | |
1657 | call_rcu(&old->rcu, pipapo_reclaim_match); | |
1658 | ||
1659 | priv->clone = new_clone; | |
1660 | } | |
1661 | ||
1662 | /** | |
1663 | * nft_pipapo_activate() - Mark element reference as active given key, commit | |
1664 | * @net: Network namespace | |
1665 | * @set: nftables API set representation | |
1666 | * @elem: nftables API element representation containing key data | |
1667 | * | |
1668 | * On insertion, elements are added to a copy of the matching data currently | |
1669 | * in use for lookups, and not directly inserted into current lookup data, so | |
1670 | * we'll take care of that by calling pipapo_commit() here. Both | |
1671 | * nft_pipapo_insert() and nft_pipapo_activate() are called once for each | |
1672 | * element, hence we can't purpose either one as a real commit operation. | |
1673 | */ | |
1674 | static void nft_pipapo_activate(const struct net *net, | |
1675 | const struct nft_set *set, | |
1676 | const struct nft_set_elem *elem) | |
1677 | { | |
1678 | struct nft_pipapo_elem *e; | |
1679 | ||
1680 | e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); | |
1681 | if (IS_ERR(e)) | |
1682 | return; | |
1683 | ||
1684 | nft_set_elem_change_active(net, set, &e->ext); | |
1685 | nft_set_elem_clear_busy(&e->ext); | |
1686 | ||
1687 | pipapo_commit(set); | |
1688 | } | |
1689 | ||
1690 | /** | |
1691 | * pipapo_deactivate() - Check that element is in set, mark as inactive | |
1692 | * @net: Network namespace | |
1693 | * @set: nftables API set representation | |
1694 | * @data: Input key data | |
1695 | * @ext: nftables API extension pointer, used to check for end element | |
1696 | * | |
1697 | * This is a convenience function that can be called from both | |
1698 | * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same | |
1699 | * operation. | |
1700 | * | |
1701 | * Return: deactivated element if found, NULL otherwise. | |
1702 | */ | |
1703 | static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, | |
1704 | const u8 *data, const struct nft_set_ext *ext) | |
1705 | { | |
1706 | struct nft_pipapo_elem *e; | |
1707 | ||
1708 | e = pipapo_get(net, set, data, nft_genmask_next(net)); | |
1709 | if (IS_ERR(e)) | |
1710 | return NULL; | |
1711 | ||
1712 | nft_set_elem_change_active(net, set, &e->ext); | |
1713 | ||
1714 | return e; | |
1715 | } | |
1716 | ||
1717 | /** | |
1718 | * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive | |
1719 | * @net: Network namespace | |
1720 | * @set: nftables API set representation | |
1721 | * @elem: nftables API element representation containing key data | |
1722 | * | |
1723 | * Return: deactivated element if found, NULL otherwise. | |
1724 | */ | |
1725 | static void *nft_pipapo_deactivate(const struct net *net, | |
1726 | const struct nft_set *set, | |
1727 | const struct nft_set_elem *elem) | |
1728 | { | |
1729 | const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); | |
1730 | ||
1731 | return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext); | |
1732 | } | |
1733 | ||
1734 | /** | |
1735 | * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive | |
1736 | * @net: Network namespace | |
1737 | * @set: nftables API set representation | |
1738 | * @elem: nftables API element representation containing key data | |
1739 | * | |
1740 | * This is functionally the same as nft_pipapo_deactivate(), with a slightly | |
1741 | * different interface, and it's also called once for each element in a set | |
1742 | * being flushed, so we can't implement, strictly speaking, a flush operation, | |
1743 | * which would otherwise be as simple as allocating an empty copy of the | |
1744 | * matching data. | |
1745 | * | |
1746 | * Note that we could in theory do that, mark the set as flushed, and ignore | |
1747 | * subsequent calls, but we would leak all the elements after the first one, | |
1748 | * because they wouldn't then be freed as result of API calls. | |
1749 | * | |
1750 | * Return: true if element was found and deactivated. | |
1751 | */ | |
1752 | static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set, | |
1753 | void *elem) | |
1754 | { | |
1755 | struct nft_pipapo_elem *e = elem; | |
1756 | ||
1757 | return pipapo_deactivate(net, set, (const u8 *)nft_set_ext_key(&e->ext), | |
1758 | &e->ext); | |
1759 | } | |
1760 | ||
1761 | /** | |
1762 | * pipapo_get_boundaries() - Get byte interval for associated rules | |
1763 | * @f: Field including lookup table | |
1764 | * @first_rule: First rule (lowest index) | |
1765 | * @rule_count: Number of associated rules | |
1766 | * @left: Byte expression for left boundary (start of range) | |
1767 | * @right: Byte expression for right boundary (end of range) | |
1768 | * | |
1769 | * Given the first rule and amount of rules that originated from the same entry, | |
1770 | * build the original range associated with the entry, and calculate the length | |
1771 | * of the originating netmask. | |
1772 | * | |
1773 | * In pictures: | |
1774 | * | |
1775 | * bucket | |
1776 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
1777 | * 0 1,2 | |
1778 | * 1 1,2 | |
1779 | * 2 1,2 | |
1780 | * 3 1,2 | |
1781 | * 4 1,2 | |
1782 | * 5 1 2 | |
1783 | * 6 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | |
1784 | * 7 1,2 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | |
1785 | * | |
1786 | * this is the lookup table corresponding to the IPv4 range | |
1787 | * 192.168.1.0-192.168.2.1, which was expanded to the two composing netmasks, | |
1788 | * rule #1: 192.168.1.0/24, and rule #2: 192.168.2.0/31. | |
1789 | * | |
1790 | * This function fills @left and @right with the byte values of the leftmost | |
1791 | * and rightmost bucket indices for the lowest and highest rule indices, | |
1792 | * respectively. If @first_rule is 1 and @rule_count is 2, we obtain, in | |
1793 | * nibbles: | |
1794 | * left: < 12, 0, 10, 8, 0, 1, 0, 0 > | |
1795 | * right: < 12, 0, 10, 8, 0, 2, 2, 1 > | |
1796 | * corresponding to bytes: | |
1797 | * left: < 192, 168, 1, 0 > | |
1798 | * right: < 192, 168, 2, 1 > | |
1799 | * with mask length irrelevant here, unused on return, as the range is already | |
1800 | * defined by its start and end points. The mask length is relevant for a single | |
1801 | * ranged entry instead: if @first_rule is 1 and @rule_count is 1, we ignore | |
1802 | * rule 2 above: @left becomes < 192, 168, 1, 0 >, @right becomes | |
1803 | * < 192, 168, 1, 255 >, and the mask length, calculated from the distances | |
1804 | * between leftmost and rightmost bucket indices for each group, would be 24. | |
1805 | * | |
1806 | * Return: mask length, in bits. | |
1807 | */ | |
1808 | static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule, | |
1809 | int rule_count, u8 *left, u8 *right) | |
1810 | { | |
e807b13c | 1811 | int g, mask_len = 0, bit_offset = 0; |
3c4287f6 | 1812 | u8 *l = left, *r = right; |
3c4287f6 SB |
1813 | |
1814 | for (g = 0; g < f->groups; g++) { | |
1815 | int b, x0, x1; | |
1816 | ||
1817 | x0 = -1; | |
1818 | x1 = -1; | |
e807b13c | 1819 | for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) { |
3c4287f6 SB |
1820 | unsigned long *pos; |
1821 | ||
bf3e5839 SB |
1822 | pos = NFT_PIPAPO_LT_ALIGN(f->lt) + |
1823 | (g * NFT_PIPAPO_BUCKETS(f->bb) + b) * f->bsize; | |
3c4287f6 SB |
1824 | if (test_bit(first_rule, pos) && x0 == -1) |
1825 | x0 = b; | |
1826 | if (test_bit(first_rule + rule_count - 1, pos)) | |
1827 | x1 = b; | |
1828 | } | |
1829 | ||
e807b13c SB |
1830 | *l |= x0 << (BITS_PER_BYTE - f->bb - bit_offset); |
1831 | *r |= x1 << (BITS_PER_BYTE - f->bb - bit_offset); | |
1832 | ||
1833 | bit_offset += f->bb; | |
1834 | if (bit_offset >= BITS_PER_BYTE) { | |
1835 | bit_offset %= BITS_PER_BYTE; | |
1836 | l++; | |
1837 | r++; | |
3c4287f6 SB |
1838 | } |
1839 | ||
1840 | if (x1 - x0 == 0) | |
1841 | mask_len += 4; | |
1842 | else if (x1 - x0 == 1) | |
1843 | mask_len += 3; | |
1844 | else if (x1 - x0 == 3) | |
1845 | mask_len += 2; | |
1846 | else if (x1 - x0 == 7) | |
1847 | mask_len += 1; | |
1848 | } | |
1849 | ||
1850 | return mask_len; | |
1851 | } | |
1852 | ||
1853 | /** | |
1854 | * pipapo_match_field() - Match rules against byte ranges | |
1855 | * @f: Field including the lookup table | |
1856 | * @first_rule: First of associated rules originating from same entry | |
1857 | * @rule_count: Amount of associated rules | |
1858 | * @start: Start of range to be matched | |
1859 | * @end: End of range to be matched | |
1860 | * | |
1861 | * Return: true on match, false otherwise. | |
1862 | */ | |
1863 | static bool pipapo_match_field(struct nft_pipapo_field *f, | |
1864 | int first_rule, int rule_count, | |
1865 | const u8 *start, const u8 *end) | |
1866 | { | |
1867 | u8 right[NFT_PIPAPO_MAX_BYTES] = { 0 }; | |
1868 | u8 left[NFT_PIPAPO_MAX_BYTES] = { 0 }; | |
1869 | ||
1870 | pipapo_get_boundaries(f, first_rule, rule_count, left, right); | |
1871 | ||
e807b13c SB |
1872 | return !memcmp(start, left, |
1873 | f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) && | |
1874 | !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)); | |
3c4287f6 SB |
1875 | } |
1876 | ||
1877 | /** | |
1878 | * nft_pipapo_remove() - Remove element given key, commit | |
1879 | * @net: Network namespace | |
1880 | * @set: nftables API set representation | |
1881 | * @elem: nftables API element representation containing key data | |
1882 | * | |
1883 | * Similarly to nft_pipapo_activate(), this is used as commit operation by the | |
1884 | * API, but it's called once per element in the pending transaction, so we can't | |
1885 | * implement this as a single commit operation. Closest we can get is to remove | |
1886 | * the matched element here, if any, and commit the updated matching data. | |
1887 | */ | |
1888 | static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, | |
1889 | const struct nft_set_elem *elem) | |
1890 | { | |
3c4287f6 SB |
1891 | struct nft_pipapo *priv = nft_set_priv(set); |
1892 | struct nft_pipapo_match *m = priv->clone; | |
212d58c1 | 1893 | struct nft_pipapo_elem *e = elem->priv; |
3c4287f6 | 1894 | int rules_f0, first_rule = 0; |
212d58c1 SB |
1895 | const u8 *data; |
1896 | ||
1897 | data = (const u8 *)nft_set_ext_key(&e->ext); | |
3c4287f6 SB |
1898 | |
1899 | e = pipapo_get(net, set, data, 0); | |
1900 | if (IS_ERR(e)) | |
1901 | return; | |
1902 | ||
1903 | while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { | |
1904 | union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; | |
1905 | const u8 *match_start, *match_end; | |
1906 | struct nft_pipapo_field *f; | |
1907 | int i, start, rules_fx; | |
1908 | ||
1909 | match_start = data; | |
1910 | match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; | |
1911 | ||
1912 | start = first_rule; | |
1913 | rules_fx = rules_f0; | |
1914 | ||
1915 | nft_pipapo_for_each_field(f, i, m) { | |
1916 | if (!pipapo_match_field(f, start, rules_fx, | |
1917 | match_start, match_end)) | |
1918 | break; | |
1919 | ||
1920 | rulemap[i].to = start; | |
1921 | rulemap[i].n = rules_fx; | |
1922 | ||
1923 | rules_fx = f->mt[start].n; | |
1924 | start = f->mt[start].to; | |
1925 | ||
e807b13c SB |
1926 | match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); |
1927 | match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); | |
3c4287f6 SB |
1928 | } |
1929 | ||
1930 | if (i == m->field_count) { | |
1931 | priv->dirty = true; | |
1932 | pipapo_drop(m, rulemap); | |
1933 | pipapo_commit(set); | |
1934 | return; | |
1935 | } | |
1936 | ||
1937 | first_rule += rules_f0; | |
1938 | } | |
1939 | } | |
1940 | ||
1941 | /** | |
1942 | * nft_pipapo_walk() - Walk over elements | |
1943 | * @ctx: nftables API context | |
1944 | * @set: nftables API set representation | |
1945 | * @iter: Iterator | |
1946 | * | |
1947 | * As elements are referenced in the mapping array for the last field, directly | |
1948 | * scan that array: there's no need to follow rule mappings from the first | |
1949 | * field. | |
1950 | */ | |
1951 | static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, | |
1952 | struct nft_set_iter *iter) | |
1953 | { | |
1954 | struct nft_pipapo *priv = nft_set_priv(set); | |
1955 | struct nft_pipapo_match *m; | |
1956 | struct nft_pipapo_field *f; | |
1957 | int i, r; | |
1958 | ||
1959 | rcu_read_lock(); | |
1960 | m = rcu_dereference(priv->match); | |
1961 | ||
1962 | if (unlikely(!m)) | |
1963 | goto out; | |
1964 | ||
1965 | for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) | |
1966 | ; | |
1967 | ||
1968 | for (r = 0; r < f->rules; r++) { | |
1969 | struct nft_pipapo_elem *e; | |
1970 | struct nft_set_elem elem; | |
1971 | ||
1972 | if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) | |
1973 | continue; | |
1974 | ||
1975 | if (iter->count < iter->skip) | |
1976 | goto cont; | |
1977 | ||
1978 | e = f->mt[r].e; | |
1979 | if (nft_set_elem_expired(&e->ext)) | |
1980 | goto cont; | |
1981 | ||
1982 | elem.priv = e; | |
1983 | ||
1984 | iter->err = iter->fn(ctx, set, iter, &elem); | |
1985 | if (iter->err < 0) | |
1986 | goto out; | |
1987 | ||
1988 | cont: | |
1989 | iter->count++; | |
1990 | } | |
1991 | ||
1992 | out: | |
1993 | rcu_read_unlock(); | |
1994 | } | |
1995 | ||
1996 | /** | |
1997 | * nft_pipapo_privsize() - Return the size of private data for the set | |
1998 | * @nla: netlink attributes, ignored as size doesn't depend on them | |
1999 | * @desc: Set description, ignored as size doesn't depend on it | |
2000 | * | |
2001 | * Return: size of private data for this set implementation, in bytes | |
2002 | */ | |
2003 | static u64 nft_pipapo_privsize(const struct nlattr * const nla[], | |
2004 | const struct nft_set_desc *desc) | |
2005 | { | |
2006 | return sizeof(struct nft_pipapo); | |
2007 | } | |
2008 | ||
2009 | /** | |
8683f4b9 SB |
2010 | * nft_pipapo_estimate() - Set size, space and lookup complexity |
2011 | * @desc: Set description, element count and field description used | |
3c4287f6 SB |
2012 | * @features: Flags: NFT_SET_INTERVAL needs to be there |
2013 | * @est: Storage for estimation data | |
2014 | * | |
8683f4b9 | 2015 | * Return: true if set description is compatible, false otherwise |
3c4287f6 SB |
2016 | */ |
2017 | static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features, | |
2018 | struct nft_set_estimate *est) | |
2019 | { | |
eb16933a SB |
2020 | if (!(features & NFT_SET_INTERVAL) || |
2021 | desc->field_count < NFT_PIPAPO_MIN_FIELDS) | |
3c4287f6 SB |
2022 | return false; |
2023 | ||
8683f4b9 SB |
2024 | est->size = pipapo_estimate_size(desc); |
2025 | if (!est->size) | |
3c4287f6 SB |
2026 | return false; |
2027 | ||
3c4287f6 SB |
2028 | est->lookup = NFT_SET_CLASS_O_LOG_N; |
2029 | ||
2030 | est->space = NFT_SET_CLASS_O_N; | |
2031 | ||
2032 | return true; | |
2033 | } | |
2034 | ||
2035 | /** | |
2036 | * nft_pipapo_init() - Initialise data for a set instance | |
2037 | * @set: nftables API set representation | |
2038 | * @desc: Set description | |
2039 | * @nla: netlink attributes | |
2040 | * | |
2041 | * Validate number and size of fields passed as NFTA_SET_DESC_CONCAT netlink | |
2042 | * attributes, initialise internal set parameters, current instance of matching | |
2043 | * data and a copy for subsequent insertions. | |
2044 | * | |
2045 | * Return: 0 on success, negative error code on failure. | |
2046 | */ | |
2047 | static int nft_pipapo_init(const struct nft_set *set, | |
2048 | const struct nft_set_desc *desc, | |
2049 | const struct nlattr * const nla[]) | |
2050 | { | |
2051 | struct nft_pipapo *priv = nft_set_priv(set); | |
2052 | struct nft_pipapo_match *m; | |
2053 | struct nft_pipapo_field *f; | |
eb16933a | 2054 | int err, i, field_count; |
3c4287f6 | 2055 | |
eb16933a SB |
2056 | field_count = desc->field_count ? : 1; |
2057 | ||
2058 | if (field_count > NFT_PIPAPO_MAX_FIELDS) | |
3c4287f6 SB |
2059 | return -EINVAL; |
2060 | ||
eb16933a | 2061 | m = kmalloc(sizeof(*priv->match) + sizeof(*f) * field_count, |
3c4287f6 SB |
2062 | GFP_KERNEL); |
2063 | if (!m) | |
2064 | return -ENOMEM; | |
2065 | ||
eb16933a | 2066 | m->field_count = field_count; |
3c4287f6 SB |
2067 | m->bsize_max = 0; |
2068 | ||
2069 | m->scratch = alloc_percpu(unsigned long *); | |
2070 | if (!m->scratch) { | |
2071 | err = -ENOMEM; | |
bf3e5839 | 2072 | goto out_scratch; |
3c4287f6 SB |
2073 | } |
2074 | for_each_possible_cpu(i) | |
2075 | *per_cpu_ptr(m->scratch, i) = NULL; | |
2076 | ||
bf3e5839 SB |
2077 | #ifdef NFT_PIPAPO_ALIGN |
2078 | m->scratch_aligned = alloc_percpu(unsigned long *); | |
2079 | if (!m->scratch_aligned) { | |
2080 | err = -ENOMEM; | |
2081 | goto out_free; | |
2082 | } | |
2083 | for_each_possible_cpu(i) | |
2084 | *per_cpu_ptr(m->scratch_aligned, i) = NULL; | |
2085 | #endif | |
2086 | ||
3c4287f6 SB |
2087 | rcu_head_init(&m->rcu); |
2088 | ||
2089 | nft_pipapo_for_each_field(f, i, m) { | |
eb16933a SB |
2090 | int len = desc->field_len[i] ? : set->klen; |
2091 | ||
4051f431 | 2092 | f->bb = NFT_PIPAPO_GROUP_BITS_INIT; |
eb16933a | 2093 | f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f); |
3c4287f6 | 2094 | |
eb16933a | 2095 | priv->width += round_up(len, sizeof(u32)); |
3c4287f6 SB |
2096 | |
2097 | f->bsize = 0; | |
2098 | f->rules = 0; | |
bf3e5839 | 2099 | NFT_PIPAPO_LT_ASSIGN(f, NULL); |
3c4287f6 SB |
2100 | f->mt = NULL; |
2101 | } | |
2102 | ||
2103 | /* Create an initial clone of matching data for next insertion */ | |
2104 | priv->clone = pipapo_clone(m); | |
2105 | if (IS_ERR(priv->clone)) { | |
2106 | err = PTR_ERR(priv->clone); | |
2107 | goto out_free; | |
2108 | } | |
2109 | ||
2110 | priv->dirty = false; | |
2111 | ||
2112 | rcu_assign_pointer(priv->match, m); | |
2113 | ||
2114 | return 0; | |
2115 | ||
2116 | out_free: | |
bf3e5839 SB |
2117 | #ifdef NFT_PIPAPO_ALIGN |
2118 | free_percpu(m->scratch_aligned); | |
2119 | #endif | |
3c4287f6 | 2120 | free_percpu(m->scratch); |
bf3e5839 | 2121 | out_scratch: |
3c4287f6 SB |
2122 | kfree(m); |
2123 | ||
2124 | return err; | |
2125 | } | |
2126 | ||
9827a0e6 PNA |
2127 | /** |
2128 | * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array | |
2129 | * @set: nftables API set representation | |
2130 | * @m: matching data pointing to key mapping array | |
2131 | */ | |
2132 | static void nft_set_pipapo_match_destroy(const struct nft_set *set, | |
2133 | struct nft_pipapo_match *m) | |
2134 | { | |
2135 | struct nft_pipapo_field *f; | |
2136 | int i, r; | |
2137 | ||
2138 | for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) | |
2139 | ; | |
2140 | ||
2141 | for (r = 0; r < f->rules; r++) { | |
2142 | struct nft_pipapo_elem *e; | |
2143 | ||
2144 | if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) | |
2145 | continue; | |
2146 | ||
2147 | e = f->mt[r].e; | |
2148 | ||
2149 | nft_set_elem_destroy(set, e, true); | |
2150 | } | |
2151 | } | |
2152 | ||
3c4287f6 SB |
2153 | /** |
2154 | * nft_pipapo_destroy() - Free private data for set and all committed elements | |
2155 | * @set: nftables API set representation | |
2156 | */ | |
2157 | static void nft_pipapo_destroy(const struct nft_set *set) | |
2158 | { | |
2159 | struct nft_pipapo *priv = nft_set_priv(set); | |
2160 | struct nft_pipapo_match *m; | |
9827a0e6 | 2161 | int cpu; |
3c4287f6 SB |
2162 | |
2163 | m = rcu_dereference_protected(priv->match, true); | |
2164 | if (m) { | |
2165 | rcu_barrier(); | |
2166 | ||
9827a0e6 | 2167 | nft_set_pipapo_match_destroy(set, m); |
3c4287f6 | 2168 | |
bf3e5839 SB |
2169 | #ifdef NFT_PIPAPO_ALIGN |
2170 | free_percpu(m->scratch_aligned); | |
2171 | #endif | |
3c4287f6 SB |
2172 | for_each_possible_cpu(cpu) |
2173 | kfree(*per_cpu_ptr(m->scratch, cpu)); | |
2174 | free_percpu(m->scratch); | |
3c4287f6 SB |
2175 | pipapo_free_fields(m); |
2176 | kfree(m); | |
2177 | priv->match = NULL; | |
2178 | } | |
2179 | ||
2180 | if (priv->clone) { | |
9827a0e6 PNA |
2181 | m = priv->clone; |
2182 | ||
2183 | if (priv->dirty) | |
2184 | nft_set_pipapo_match_destroy(set, m); | |
2185 | ||
bf3e5839 SB |
2186 | #ifdef NFT_PIPAPO_ALIGN |
2187 | free_percpu(priv->clone->scratch_aligned); | |
2188 | #endif | |
3c4287f6 SB |
2189 | for_each_possible_cpu(cpu) |
2190 | kfree(*per_cpu_ptr(priv->clone->scratch, cpu)); | |
2191 | free_percpu(priv->clone->scratch); | |
2192 | ||
2193 | pipapo_free_fields(priv->clone); | |
2194 | kfree(priv->clone); | |
2195 | priv->clone = NULL; | |
2196 | } | |
2197 | } | |
2198 | ||
2199 | /** | |
2200 | * nft_pipapo_gc_init() - Initialise garbage collection | |
2201 | * @set: nftables API set representation | |
2202 | * | |
2203 | * Instead of actually setting up a periodic work for garbage collection, as | |
2204 | * this operation requires a swap of matching data with the working copy, we'll | |
2205 | * do that opportunistically with other commit operations if the interval is | |
2206 | * elapsed, so we just need to set the current jiffies timestamp here. | |
2207 | */ | |
2208 | static void nft_pipapo_gc_init(const struct nft_set *set) | |
2209 | { | |
2210 | struct nft_pipapo *priv = nft_set_priv(set); | |
2211 | ||
2212 | priv->last_gc = jiffies; | |
2213 | } | |
2214 | ||
24d19826 | 2215 | const struct nft_set_type nft_set_pipapo_type = { |
3c4287f6 SB |
2216 | .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | |
2217 | NFT_SET_TIMEOUT, | |
2218 | .ops = { | |
2219 | .lookup = nft_pipapo_lookup, | |
2220 | .insert = nft_pipapo_insert, | |
2221 | .activate = nft_pipapo_activate, | |
2222 | .deactivate = nft_pipapo_deactivate, | |
2223 | .flush = nft_pipapo_flush, | |
2224 | .remove = nft_pipapo_remove, | |
2225 | .walk = nft_pipapo_walk, | |
2226 | .get = nft_pipapo_get, | |
2227 | .privsize = nft_pipapo_privsize, | |
2228 | .estimate = nft_pipapo_estimate, | |
2229 | .init = nft_pipapo_init, | |
2230 | .destroy = nft_pipapo_destroy, | |
2231 | .gc_init = nft_pipapo_gc_init, | |
2232 | .elemsize = offsetof(struct nft_pipapo_elem, ext), | |
2233 | }, | |
2234 | }; | |
7400b063 | 2235 | |
e6abef61 | 2236 | #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) |
7400b063 SB |
2237 | const struct nft_set_type nft_set_pipapo_avx2_type = { |
2238 | .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | | |
2239 | NFT_SET_TIMEOUT, | |
2240 | .ops = { | |
2241 | .lookup = nft_pipapo_avx2_lookup, | |
2242 | .insert = nft_pipapo_insert, | |
2243 | .activate = nft_pipapo_activate, | |
2244 | .deactivate = nft_pipapo_deactivate, | |
2245 | .flush = nft_pipapo_flush, | |
2246 | .remove = nft_pipapo_remove, | |
2247 | .walk = nft_pipapo_walk, | |
2248 | .get = nft_pipapo_get, | |
2249 | .privsize = nft_pipapo_privsize, | |
2250 | .estimate = nft_pipapo_avx2_estimate, | |
2251 | .init = nft_pipapo_init, | |
2252 | .destroy = nft_pipapo_destroy, | |
2253 | .gc_init = nft_pipapo_gc_init, | |
2254 | .elemsize = offsetof(struct nft_pipapo_elem, ext), | |
2255 | }, | |
2256 | }; | |
2257 | #endif |