Commit | Line | Data |
---|---|---|
3c4287f6 SB |
1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | ||
3 | /* PIPAPO: PIle PAcket POlicies: set for arbitrary concatenations of ranges | |
4 | * | |
5 | * Copyright (c) 2019-2020 Red Hat GmbH | |
6 | * | |
7 | * Author: Stefano Brivio <sbrivio@redhat.com> | |
8 | */ | |
9 | ||
10 | /** | |
11 | * DOC: Theory of Operation | |
12 | * | |
13 | * | |
14 | * Problem | |
15 | * ------- | |
16 | * | |
17 | * Match packet bytes against entries composed of ranged or non-ranged packet | |
18 | * field specifiers, mapping them to arbitrary references. For example: | |
19 | * | |
20 | * :: | |
21 | * | |
22 | * --- fields ---> | |
23 | * | [net],[port],[net]... => [reference] | |
24 | * entries [net],[port],[net]... => [reference] | |
25 | * | [net],[port],[net]... => [reference] | |
26 | * V ... | |
27 | * | |
28 | * where [net] fields can be IP ranges or netmasks, and [port] fields are port | |
29 | * ranges. Arbitrary packet fields can be matched. | |
30 | * | |
31 | * | |
32 | * Algorithm Overview | |
33 | * ------------------ | |
34 | * | |
35 | * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally | |
36 | * relies on the consideration that every contiguous range in a space of b bits | |
37 | * can be converted into b * 2 netmasks, from Theorem 3 in [Rottenstreich 2010], | |
38 | * as also illustrated in Section 9 of [Kogan 2014]. | |
39 | * | |
40 | * Classification against a number of entries, that require matching given bits | |
41 | * of a packet field, is performed by grouping those bits in sets of arbitrary | |
42 | * size, and classifying packet bits one group at a time. | |
43 | * | |
44 | * Example: | |
45 | * to match the source port (16 bits) of a packet, we can divide those 16 bits | |
46 | * in 4 groups of 4 bits each. Given the entry: | |
47 | * 0000 0001 0101 1001 | |
48 | * and a packet with source port: | |
49 | * 0000 0001 1010 1001 | |
50 | * first and second groups match, but the third doesn't. We conclude that the | |
51 | * packet doesn't match the given entry. | |
52 | * | |
53 | * Translate the set to a sequence of lookup tables, one per field. Each table | |
54 | * has two dimensions: bit groups to be matched for a single packet field, and | |
55 | * all the possible values of said groups (buckets). Input entries are | |
56 | * represented as one or more rules, depending on the number of composing | |
57 | * netmasks for the given field specifier, and a group match is indicated as a | |
58 | * set bit, with number corresponding to the rule index, in all the buckets | |
59 | * whose value matches the entry for a given group. | |
60 | * | |
61 | * Rules are mapped between fields through an array of x, n pairs, with each | |
62 | * item mapping a matched rule to one or more rules. The position of the pair in | |
63 | * the array indicates the matched rule to be mapped to the next field, x | |
64 | * indicates the first rule index in the next field, and n the amount of | |
65 | * next-field rules the current rule maps to. | |
66 | * | |
67 | * The mapping array for the last field maps to the desired references. | |
68 | * | |
69 | * To match, we perform table lookups using the values of grouped packet bits, | |
70 | * and use a sequence of bitwise operations to progressively evaluate rule | |
71 | * matching. | |
72 | * | |
73 | * A stand-alone, reference implementation, also including notes about possible | |
74 | * future optimisations, is available at: | |
75 | * https://pipapo.lameexcu.se/ | |
76 | * | |
77 | * Insertion | |
78 | * --------- | |
79 | * | |
80 | * - For each packet field: | |
81 | * | |
82 | * - divide the b packet bits we want to classify into groups of size t, | |
83 | * obtaining ceil(b / t) groups | |
84 | * | |
85 | * Example: match on destination IP address, with t = 4: 32 bits, 8 groups | |
86 | * of 4 bits each | |
87 | * | |
88 | * - allocate a lookup table with one column ("bucket") for each possible | |
89 | * value of a group, and with one row for each group | |
90 | * | |
91 | * Example: 8 groups, 2^4 buckets: | |
92 | * | |
93 | * :: | |
94 | * | |
95 | * bucket | |
96 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
97 | * 0 | |
98 | * 1 | |
99 | * 2 | |
100 | * 3 | |
101 | * 4 | |
102 | * 5 | |
103 | * 6 | |
104 | * 7 | |
105 | * | |
106 | * - map the bits we want to classify for the current field, for a given | |
107 | * entry, to a single rule for non-ranged and netmask set items, and to one | |
108 | * or multiple rules for ranges. Ranges are expanded to composing netmasks | |
109 | * by pipapo_expand(). | |
110 | * | |
111 | * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048 | |
112 | * - rule #0: 10.0.0.5 | |
113 | * - rule #1: 192.168.1.0/24 | |
114 | * - rule #2: 192.168.2.0/31 | |
115 | * | |
116 | * - insert references to the rules in the lookup table, selecting buckets | |
117 | * according to bit values of a rule in the given group. This is done by | |
118 | * pipapo_insert(). | |
119 | * | |
120 | * Example: given: | |
121 | * - rule #0: 10.0.0.5 mapping to buckets | |
122 | * < 0 10 0 0 0 0 0 5 > | |
123 | * - rule #1: 192.168.1.0/24 mapping to buckets | |
124 | * < 12 0 10 8 0 1 < 0..15 > < 0..15 > > | |
125 | * - rule #2: 192.168.2.0/31 mapping to buckets | |
126 | * < 12 0 10 8 0 2 0 < 0..1 > > | |
127 | * | |
128 | * these bits are set in the lookup table: | |
129 | * | |
130 | * :: | |
131 | * | |
132 | * bucket | |
133 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
134 | * 0 0 1,2 | |
135 | * 1 1,2 0 | |
136 | * 2 0 1,2 | |
137 | * 3 0 1,2 | |
138 | * 4 0,1,2 | |
139 | * 5 0 1 2 | |
140 | * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | |
141 | * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 | |
142 | * | |
143 | * - if this is not the last field in the set, fill a mapping array that maps | |
144 | * rules from the lookup table to rules belonging to the same entry in | |
145 | * the next lookup table, done by pipapo_map(). | |
146 | * | |
147 | * Note that as rules map to contiguous ranges of rules, given how netmask | |
148 | * expansion and insertion is performed, &union nft_pipapo_map_bucket stores | |
149 | * this information as pairs of first rule index, rule count. | |
150 | * | |
151 | * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048, | |
152 | * given lookup table #0 for field 0 (see example above): | |
153 | * | |
154 | * :: | |
155 | * | |
156 | * bucket | |
157 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
158 | * 0 0 1,2 | |
159 | * 1 1,2 0 | |
160 | * 2 0 1,2 | |
161 | * 3 0 1,2 | |
162 | * 4 0,1,2 | |
163 | * 5 0 1 2 | |
164 | * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | |
165 | * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 | |
166 | * | |
167 | * and lookup table #1 for field 1 with: | |
168 | * - rule #0: 1024 mapping to buckets | |
169 | * < 0 0 4 0 > | |
170 | * - rule #1: 2048 mapping to buckets | |
171 | * < 0 0 5 0 > | |
172 | * | |
173 | * :: | |
174 | * | |
175 | * bucket | |
176 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
177 | * 0 0,1 | |
178 | * 1 0,1 | |
179 | * 2 0 1 | |
180 | * 3 0,1 | |
181 | * | |
182 | * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024 | |
183 | * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1 | |
184 | * (rules #1, #2) to 2048 in lookup table #2 (rule #1): | |
185 | * | |
186 | * :: | |
187 | * | |
188 | * rule indices in current field: 0 1 2 | |
189 | * map to rules in next field: 0 1 1 | |
190 | * | |
191 | * - if this is the last field in the set, fill a mapping array that maps | |
192 | * rules from the last lookup table to element pointers, also done by | |
193 | * pipapo_map(). | |
194 | * | |
195 | * Note that, in this implementation, we have two elements (start, end) for | |
196 | * each entry. The pointer to the end element is stored in this array, and | |
197 | * the pointer to the start element is linked from it. | |
198 | * | |
199 | * Example: entry 10.0.0.5:1024 has a corresponding &struct nft_pipapo_elem | |
200 | * pointer, 0x66, and element for 192.168.1.0-192.168.2.1:2048 is at 0x42. | |
201 | * From the rules of lookup table #1 as mapped above: | |
202 | * | |
203 | * :: | |
204 | * | |
205 | * rule indices in last field: 0 1 | |
206 | * map to elements: 0x42 0x66 | |
207 | * | |
208 | * | |
209 | * Matching | |
210 | * -------- | |
211 | * | |
212 | * We use a result bitmap, with the size of a single lookup table bucket, to | |
213 | * represent the matching state that applies at every algorithm step. This is | |
214 | * done by pipapo_lookup(). | |
215 | * | |
216 | * - For each packet field: | |
217 | * | |
218 | * - start with an all-ones result bitmap (res_map in pipapo_lookup()) | |
219 | * | |
220 | * - perform a lookup into the table corresponding to the current field, | |
221 | * for each group, and at every group, AND the current result bitmap with | |
222 | * the value from the lookup table bucket | |
223 | * | |
224 | * :: | |
225 | * | |
226 | * Example: 192.168.1.5 < 12 0 10 8 0 1 0 5 >, with lookup table from | |
227 | * insertion examples. | |
228 | * Lookup table buckets are at least 3 bits wide, we'll assume 8 bits for | |
229 | * convenience in this example. Initial result bitmap is 0xff, the steps | |
230 | * below show the value of the result bitmap after each group is processed: | |
231 | * | |
232 | * bucket | |
233 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
234 | * 0 0 1,2 | |
235 | * result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6 | |
236 | * | |
237 | * 1 1,2 0 | |
238 | * result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6 | |
239 | * | |
240 | * 2 0 1,2 | |
241 | * result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6 | |
242 | * | |
243 | * 3 0 1,2 | |
244 | * result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6 | |
245 | * | |
246 | * 4 0,1,2 | |
247 | * result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6 | |
248 | * | |
249 | * 5 0 1 2 | |
250 | * result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2 | |
251 | * | |
252 | * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | |
253 | * result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2 | |
254 | * | |
255 | * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 | |
256 | * final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2 | |
257 | * | |
258 | * - at the next field, start with a new, all-zeroes result bitmap. For each | |
259 | * bit set in the previous result bitmap, fill the new result bitmap | |
260 | * (fill_map in pipapo_lookup()) with the rule indices from the | |
261 | * corresponding buckets of the mapping field for this field, done by | |
262 | * pipapo_refill() | |
263 | * | |
264 | * Example: with mapping table from insertion examples, with the current | |
265 | * result bitmap from the previous example, 0x02: | |
266 | * | |
267 | * :: | |
268 | * | |
269 | * rule indices in current field: 0 1 2 | |
270 | * map to rules in next field: 0 1 1 | |
271 | * | |
272 | * the new result bitmap will be 0x02: rule 1 was set, and rule 1 will be | |
273 | * set. | |
274 | * | |
275 | * We can now extend this example to cover the second iteration of the step | |
276 | * above (lookup and AND bitmap): assuming the port field is | |
277 | * 2048 < 0 0 5 0 >, with starting result bitmap 0x2, and lookup table | |
278 | * for "port" field from pre-computation example: | |
279 | * | |
280 | * :: | |
281 | * | |
282 | * bucket | |
283 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
284 | * 0 0,1 | |
285 | * 1 0,1 | |
286 | * 2 0 1 | |
287 | * 3 0,1 | |
288 | * | |
289 | * operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5] | |
290 | * & 0x3 [bucket 0], resulting bitmap is 0x2. | |
291 | * | |
292 | * - if this is the last field in the set, look up the value from the mapping | |
293 | * array corresponding to the final result bitmap | |
294 | * | |
295 | * Example: 0x2 resulting bitmap from 192.168.1.5:2048, mapping array for | |
296 | * last field from insertion example: | |
297 | * | |
298 | * :: | |
299 | * | |
300 | * rule indices in last field: 0 1 | |
301 | * map to elements: 0x42 0x66 | |
302 | * | |
303 | * the matching element is at 0x42. | |
304 | * | |
305 | * | |
306 | * References | |
307 | * ---------- | |
308 | * | |
309 | * [Ligatti 2010] | |
310 | * A Packet-classification Algorithm for Arbitrary Bitmask Rules, with | |
311 | * Automatic Time-space Tradeoffs | |
312 | * Jay Ligatti, Josh Kuhn, and Chris Gage. | |
313 | * Proceedings of the IEEE International Conference on Computer | |
314 | * Communication Networks (ICCCN), August 2010. | |
315 | * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf | |
316 | * | |
317 | * [Rottenstreich 2010] | |
318 | * Worst-Case TCAM Rule Expansion | |
319 | * Ori Rottenstreich and Isaac Keslassy. | |
320 | * 2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010. | |
321 | * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf | |
322 | * | |
323 | * [Kogan 2014] | |
324 | * SAX-PAC (Scalable And eXpressive PAcket Classification) | |
325 | * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, | |
326 | * and Patrick Eugster. | |
327 | * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. | |
328 | * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf | |
329 | */ | |
330 | ||
331 | #include <linux/kernel.h> | |
332 | #include <linux/init.h> | |
333 | #include <linux/log2.h> | |
334 | #include <linux/module.h> | |
335 | #include <linux/netlink.h> | |
336 | #include <linux/netfilter.h> | |
337 | #include <linux/netfilter/nf_tables.h> | |
338 | #include <net/netfilter/nf_tables_core.h> | |
339 | #include <uapi/linux/netfilter/nf_tables.h> | |
340 | #include <net/ipv6.h> /* For the maximum length of a field */ | |
341 | #include <linux/bitmap.h> | |
342 | #include <linux/bitops.h> | |
343 | ||
344 | /* Count of concatenated fields depends on count of 32-bit nftables registers */ | |
345 | #define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT | |
346 | ||
347 | /* Largest supported field size */ | |
348 | #define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr)) | |
349 | #define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE) | |
350 | ||
351 | /* Number of bits to be grouped together in lookup table buckets, arbitrary */ | |
352 | #define NFT_PIPAPO_GROUP_BITS 4 | |
353 | #define NFT_PIPAPO_GROUPS_PER_BYTE (BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS) | |
354 | ||
355 | /* Fields are padded to 32 bits in input registers */ | |
356 | #define NFT_PIPAPO_GROUPS_PADDED_SIZE(x) \ | |
357 | (round_up((x) / NFT_PIPAPO_GROUPS_PER_BYTE, sizeof(u32))) | |
358 | #define NFT_PIPAPO_GROUPS_PADDING(x) \ | |
359 | (NFT_PIPAPO_GROUPS_PADDED_SIZE((x)) - (x) / NFT_PIPAPO_GROUPS_PER_BYTE) | |
360 | ||
361 | /* Number of buckets, given by 2 ^ n, with n grouped bits */ | |
362 | #define NFT_PIPAPO_BUCKETS (1 << NFT_PIPAPO_GROUP_BITS) | |
363 | ||
364 | /* Each n-bit range maps to up to n * 2 rules */ | |
365 | #define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2)) | |
366 | ||
367 | /* Use the rest of mapping table buckets for rule indices, but it makes no sense | |
368 | * to exceed 32 bits | |
369 | */ | |
370 | #if BITS_PER_LONG == 64 | |
371 | #define NFT_PIPAPO_MAP_TOBITS 32 | |
372 | #else | |
373 | #define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS) | |
374 | #endif | |
375 | ||
376 | /* ...which gives us the highest allowed index for a rule */ | |
377 | #define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \ | |
378 | - (1UL << NFT_PIPAPO_MAP_NBITS)) | |
379 | ||
380 | #define nft_pipapo_for_each_field(field, index, match) \ | |
381 | for ((field) = (match)->f, (index) = 0; \ | |
382 | (index) < (match)->field_count; \ | |
383 | (index)++, (field)++) | |
384 | ||
385 | /** | |
386 | * union nft_pipapo_map_bucket - Bucket of mapping table | |
387 | * @to: First rule number (in next field) this rule maps to | |
388 | * @n: Number of rules (in next field) this rule maps to | |
389 | * @e: If there's no next field, pointer to element this rule maps to | |
390 | */ | |
391 | union nft_pipapo_map_bucket { | |
392 | struct { | |
393 | #if BITS_PER_LONG == 64 | |
394 | static_assert(NFT_PIPAPO_MAP_TOBITS <= 32); | |
395 | u32 to; | |
396 | ||
397 | static_assert(NFT_PIPAPO_MAP_NBITS <= 32); | |
398 | u32 n; | |
399 | #else | |
400 | unsigned long to:NFT_PIPAPO_MAP_TOBITS; | |
401 | unsigned long n:NFT_PIPAPO_MAP_NBITS; | |
402 | #endif | |
403 | }; | |
404 | struct nft_pipapo_elem *e; | |
405 | }; | |
406 | ||
407 | /** | |
408 | * struct nft_pipapo_field - Lookup, mapping tables and related data for a field | |
409 | * @groups: Amount of 4-bit groups | |
410 | * @rules: Number of inserted rules | |
411 | * @bsize: Size of each bucket in lookup table, in longs | |
412 | * @lt: Lookup table: 'groups' rows of NFT_PIPAPO_BUCKETS buckets | |
413 | * @mt: Mapping table: one bucket per rule | |
414 | */ | |
415 | struct nft_pipapo_field { | |
416 | int groups; | |
417 | unsigned long rules; | |
418 | size_t bsize; | |
419 | unsigned long *lt; | |
420 | union nft_pipapo_map_bucket *mt; | |
421 | }; | |
422 | ||
423 | /** | |
424 | * struct nft_pipapo_match - Data used for lookup and matching | |
425 | * @field_count Amount of fields in set | |
426 | * @scratch: Preallocated per-CPU maps for partial matching results | |
427 | * @bsize_max: Maximum lookup table bucket size of all fields, in longs | |
428 | * @rcu Matching data is swapped on commits | |
429 | * @f: Fields, with lookup and mapping tables | |
430 | */ | |
431 | struct nft_pipapo_match { | |
432 | int field_count; | |
433 | unsigned long * __percpu *scratch; | |
434 | size_t bsize_max; | |
435 | struct rcu_head rcu; | |
436 | struct nft_pipapo_field f[0]; | |
437 | }; | |
438 | ||
439 | /* Current working bitmap index, toggled between field matches */ | |
440 | static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); | |
441 | ||
442 | /** | |
443 | * struct nft_pipapo - Representation of a set | |
444 | * @match: Currently in-use matching data | |
445 | * @clone: Copy where pending insertions and deletions are kept | |
446 | * @groups: Total amount of 4-bit groups for fields in this set | |
447 | * @width: Total bytes to be matched for one packet, including padding | |
448 | * @dirty: Working copy has pending insertions or deletions | |
449 | * @last_gc: Timestamp of last garbage collection run, jiffies | |
450 | */ | |
451 | struct nft_pipapo { | |
452 | struct nft_pipapo_match __rcu *match; | |
453 | struct nft_pipapo_match *clone; | |
454 | int groups; | |
455 | int width; | |
456 | bool dirty; | |
457 | unsigned long last_gc; | |
458 | }; | |
459 | ||
460 | struct nft_pipapo_elem; | |
461 | ||
462 | /** | |
463 | * struct nft_pipapo_elem - API-facing representation of single set element | |
464 | * @ext: nftables API extensions | |
465 | */ | |
466 | struct nft_pipapo_elem { | |
467 | struct nft_set_ext ext; | |
468 | }; | |
469 | ||
470 | /** | |
471 | * pipapo_refill() - For each set bit, set bits from selected mapping table item | |
472 | * @map: Bitmap to be scanned for set bits | |
473 | * @len: Length of bitmap in longs | |
474 | * @rules: Number of rules in field | |
475 | * @dst: Destination bitmap | |
476 | * @mt: Mapping table containing bit set specifiers | |
477 | * @match_only: Find a single bit and return, don't fill | |
478 | * | |
479 | * Iteration over set bits with __builtin_ctzl(): Daniel Lemire, public domain. | |
480 | * | |
481 | * For each bit set in map, select the bucket from mapping table with index | |
482 | * corresponding to the position of the bit set. Use start bit and amount of | |
483 | * bits specified in bucket to fill region in dst. | |
484 | * | |
485 | * Return: -1 on no match, bit position on 'match_only', 0 otherwise. | |
486 | */ | |
487 | static int pipapo_refill(unsigned long *map, int len, int rules, | |
488 | unsigned long *dst, union nft_pipapo_map_bucket *mt, | |
489 | bool match_only) | |
490 | { | |
491 | unsigned long bitset; | |
492 | int k, ret = -1; | |
493 | ||
494 | for (k = 0; k < len; k++) { | |
495 | bitset = map[k]; | |
496 | while (bitset) { | |
497 | unsigned long t = bitset & -bitset; | |
498 | int r = __builtin_ctzl(bitset); | |
499 | int i = k * BITS_PER_LONG + r; | |
500 | ||
501 | if (unlikely(i >= rules)) { | |
502 | map[k] = 0; | |
503 | return -1; | |
504 | } | |
505 | ||
506 | if (unlikely(match_only)) { | |
507 | bitmap_clear(map, i, 1); | |
508 | return i; | |
509 | } | |
510 | ||
511 | ret = 0; | |
512 | ||
513 | bitmap_set(dst, mt[i].to, mt[i].n); | |
514 | ||
515 | bitset ^= t; | |
516 | } | |
517 | map[k] = 0; | |
518 | } | |
519 | ||
520 | return ret; | |
521 | } | |
522 | ||
523 | /** | |
524 | * nft_pipapo_lookup() - Lookup function | |
525 | * @net: Network namespace | |
526 | * @set: nftables API set representation | |
527 | * @elem: nftables API element representation containing key data | |
528 | * @ext: nftables API extension pointer, filled with matching reference | |
529 | * | |
530 | * For more details, see DOC: Theory of Operation. | |
531 | * | |
532 | * Return: true on match, false otherwise. | |
533 | */ | |
534 | static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, | |
535 | const u32 *key, const struct nft_set_ext **ext) | |
536 | { | |
537 | struct nft_pipapo *priv = nft_set_priv(set); | |
538 | unsigned long *res_map, *fill_map; | |
539 | u8 genmask = nft_genmask_cur(net); | |
540 | const u8 *rp = (const u8 *)key; | |
541 | struct nft_pipapo_match *m; | |
542 | struct nft_pipapo_field *f; | |
543 | bool map_index; | |
544 | int i; | |
545 | ||
546 | local_bh_disable(); | |
547 | ||
548 | map_index = raw_cpu_read(nft_pipapo_scratch_index); | |
549 | ||
550 | m = rcu_dereference(priv->match); | |
551 | ||
552 | if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) | |
553 | goto out; | |
554 | ||
555 | res_map = *raw_cpu_ptr(m->scratch) + (map_index ? m->bsize_max : 0); | |
556 | fill_map = *raw_cpu_ptr(m->scratch) + (map_index ? 0 : m->bsize_max); | |
557 | ||
558 | memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); | |
559 | ||
560 | nft_pipapo_for_each_field(f, i, m) { | |
561 | bool last = i == m->field_count - 1; | |
562 | unsigned long *lt = f->lt; | |
563 | int b, group; | |
564 | ||
565 | /* For each 4-bit group: select lookup table bucket depending on | |
566 | * packet bytes value, then AND bucket value | |
567 | */ | |
568 | for (group = 0; group < f->groups; group += 2) { | |
569 | u8 v; | |
570 | ||
571 | v = *rp >> 4; | |
572 | __bitmap_and(res_map, res_map, lt + v * f->bsize, | |
573 | f->bsize * BITS_PER_LONG); | |
574 | lt += f->bsize * NFT_PIPAPO_BUCKETS; | |
575 | ||
576 | v = *rp & 0x0f; | |
577 | rp++; | |
578 | __bitmap_and(res_map, res_map, lt + v * f->bsize, | |
579 | f->bsize * BITS_PER_LONG); | |
580 | lt += f->bsize * NFT_PIPAPO_BUCKETS; | |
581 | } | |
582 | ||
583 | /* Now populate the bitmap for the next field, unless this is | |
584 | * the last field, in which case return the matched 'ext' | |
585 | * pointer if any. | |
586 | * | |
587 | * Now res_map contains the matching bitmap, and fill_map is the | |
588 | * bitmap for the next field. | |
589 | */ | |
590 | next_match: | |
591 | b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, | |
592 | last); | |
593 | if (b < 0) { | |
594 | raw_cpu_write(nft_pipapo_scratch_index, map_index); | |
595 | local_bh_enable(); | |
596 | ||
597 | return false; | |
598 | } | |
599 | ||
600 | if (last) { | |
601 | *ext = &f->mt[b].e->ext; | |
602 | if (unlikely(nft_set_elem_expired(*ext) || | |
603 | !nft_set_elem_active(*ext, genmask))) | |
604 | goto next_match; | |
605 | ||
606 | /* Last field: we're just returning the key without | |
607 | * filling the initial bitmap for the next field, so the | |
608 | * current inactive bitmap is clean and can be reused as | |
609 | * *next* bitmap (not initial) for the next packet. | |
610 | */ | |
611 | raw_cpu_write(nft_pipapo_scratch_index, map_index); | |
612 | local_bh_enable(); | |
613 | ||
614 | return true; | |
615 | } | |
616 | ||
617 | /* Swap bitmap indices: res_map is the initial bitmap for the | |
618 | * next field, and fill_map is guaranteed to be all-zeroes at | |
619 | * this point. | |
620 | */ | |
621 | map_index = !map_index; | |
622 | swap(res_map, fill_map); | |
623 | ||
624 | rp += NFT_PIPAPO_GROUPS_PADDING(f->groups); | |
625 | } | |
626 | ||
627 | out: | |
628 | local_bh_enable(); | |
629 | return false; | |
630 | } | |
631 | ||
632 | /** | |
633 | * pipapo_get() - Get matching element reference given key data | |
634 | * @net: Network namespace | |
635 | * @set: nftables API set representation | |
636 | * @data: Key data to be matched against existing elements | |
637 | * @genmask: If set, check that element is active in given genmask | |
638 | * | |
639 | * This is essentially the same as the lookup function, except that it matches | |
640 | * key data against the uncommitted copy and doesn't use preallocated maps for | |
641 | * bitmap results. | |
642 | * | |
643 | * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise. | |
644 | */ | |
645 | static struct nft_pipapo_elem *pipapo_get(const struct net *net, | |
646 | const struct nft_set *set, | |
647 | const u8 *data, u8 genmask) | |
648 | { | |
649 | struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); | |
650 | struct nft_pipapo *priv = nft_set_priv(set); | |
651 | struct nft_pipapo_match *m = priv->clone; | |
652 | unsigned long *res_map, *fill_map = NULL; | |
653 | struct nft_pipapo_field *f; | |
654 | int i; | |
655 | ||
656 | res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); | |
657 | if (!res_map) { | |
658 | ret = ERR_PTR(-ENOMEM); | |
659 | goto out; | |
660 | } | |
661 | ||
662 | fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); | |
663 | if (!fill_map) { | |
664 | ret = ERR_PTR(-ENOMEM); | |
665 | goto out; | |
666 | } | |
667 | ||
668 | memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); | |
669 | ||
670 | nft_pipapo_for_each_field(f, i, m) { | |
671 | bool last = i == m->field_count - 1; | |
672 | unsigned long *lt = f->lt; | |
673 | int b, group; | |
674 | ||
675 | /* For each 4-bit group: select lookup table bucket depending on | |
676 | * packet bytes value, then AND bucket value | |
677 | */ | |
678 | for (group = 0; group < f->groups; group++) { | |
679 | u8 v; | |
680 | ||
681 | if (group % 2) { | |
682 | v = *data & 0x0f; | |
683 | data++; | |
684 | } else { | |
685 | v = *data >> 4; | |
686 | } | |
687 | __bitmap_and(res_map, res_map, lt + v * f->bsize, | |
688 | f->bsize * BITS_PER_LONG); | |
689 | ||
690 | lt += f->bsize * NFT_PIPAPO_BUCKETS; | |
691 | } | |
692 | ||
693 | /* Now populate the bitmap for the next field, unless this is | |
694 | * the last field, in which case return the matched 'ext' | |
695 | * pointer if any. | |
696 | * | |
697 | * Now res_map contains the matching bitmap, and fill_map is the | |
698 | * bitmap for the next field. | |
699 | */ | |
700 | next_match: | |
701 | b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, | |
702 | last); | |
703 | if (b < 0) | |
704 | goto out; | |
705 | ||
706 | if (last) { | |
707 | if (nft_set_elem_expired(&f->mt[b].e->ext) || | |
708 | (genmask && | |
709 | !nft_set_elem_active(&f->mt[b].e->ext, genmask))) | |
710 | goto next_match; | |
711 | ||
712 | ret = f->mt[b].e; | |
713 | goto out; | |
714 | } | |
715 | ||
716 | data += NFT_PIPAPO_GROUPS_PADDING(f->groups); | |
717 | ||
718 | /* Swap bitmap indices: fill_map will be the initial bitmap for | |
719 | * the next field (i.e. the new res_map), and res_map is | |
720 | * guaranteed to be all-zeroes at this point, ready to be filled | |
721 | * according to the next mapping table. | |
722 | */ | |
723 | swap(res_map, fill_map); | |
724 | } | |
725 | ||
726 | out: | |
727 | kfree(fill_map); | |
728 | kfree(res_map); | |
729 | return ret; | |
730 | } | |
731 | ||
732 | /** | |
733 | * nft_pipapo_get() - Get matching element reference given key data | |
734 | * @net: Network namespace | |
735 | * @set: nftables API set representation | |
736 | * @elem: nftables API element representation containing key data | |
737 | * @flags: Unused | |
738 | */ | |
739 | void *nft_pipapo_get(const struct net *net, const struct nft_set *set, | |
740 | const struct nft_set_elem *elem, unsigned int flags) | |
741 | { | |
742 | return pipapo_get(net, set, (const u8 *)elem->key.val.data, | |
743 | nft_genmask_cur(net)); | |
744 | } | |
745 | ||
746 | /** | |
747 | * pipapo_resize() - Resize lookup or mapping table, or both | |
748 | * @f: Field containing lookup and mapping tables | |
749 | * @old_rules: Previous amount of rules in field | |
750 | * @rules: New amount of rules | |
751 | * | |
752 | * Increase, decrease or maintain tables size depending on new amount of rules, | |
753 | * and copy data over. In case the new size is smaller, throw away data for | |
754 | * highest-numbered rules. | |
755 | * | |
756 | * Return: 0 on success, -ENOMEM on allocation failure. | |
757 | */ | |
758 | static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) | |
759 | { | |
760 | long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; | |
761 | union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt; | |
762 | size_t new_bucket_size, copy; | |
763 | int group, bucket; | |
764 | ||
765 | new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); | |
766 | ||
767 | if (new_bucket_size == f->bsize) | |
768 | goto mt; | |
769 | ||
770 | if (new_bucket_size > f->bsize) | |
771 | copy = f->bsize; | |
772 | else | |
773 | copy = new_bucket_size; | |
774 | ||
775 | new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS * new_bucket_size * | |
776 | sizeof(*new_lt), GFP_KERNEL); | |
777 | if (!new_lt) | |
778 | return -ENOMEM; | |
779 | ||
780 | new_p = new_lt; | |
781 | old_p = old_lt; | |
782 | for (group = 0; group < f->groups; group++) { | |
783 | for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS; bucket++) { | |
784 | memcpy(new_p, old_p, copy * sizeof(*new_p)); | |
785 | new_p += copy; | |
786 | old_p += copy; | |
787 | ||
788 | if (new_bucket_size > f->bsize) | |
789 | new_p += new_bucket_size - f->bsize; | |
790 | else | |
791 | old_p += f->bsize - new_bucket_size; | |
792 | } | |
793 | } | |
794 | ||
795 | mt: | |
796 | new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL); | |
797 | if (!new_mt) { | |
798 | kvfree(new_lt); | |
799 | return -ENOMEM; | |
800 | } | |
801 | ||
802 | memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt)); | |
803 | if (rules > old_rules) { | |
804 | memset(new_mt + old_rules, 0, | |
805 | (rules - old_rules) * sizeof(*new_mt)); | |
806 | } | |
807 | ||
808 | if (new_lt) { | |
809 | f->bsize = new_bucket_size; | |
810 | f->lt = new_lt; | |
811 | kvfree(old_lt); | |
812 | } | |
813 | ||
814 | f->mt = new_mt; | |
815 | kvfree(old_mt); | |
816 | ||
817 | return 0; | |
818 | } | |
819 | ||
820 | /** | |
821 | * pipapo_bucket_set() - Set rule bit in bucket given group and group value | |
822 | * @f: Field containing lookup table | |
823 | * @rule: Rule index | |
824 | * @group: Group index | |
825 | * @v: Value of bit group | |
826 | */ | |
827 | static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, | |
828 | int v) | |
829 | { | |
830 | unsigned long *pos; | |
831 | ||
832 | pos = f->lt + f->bsize * NFT_PIPAPO_BUCKETS * group; | |
833 | pos += f->bsize * v; | |
834 | ||
835 | __set_bit(rule, pos); | |
836 | } | |
837 | ||
838 | /** | |
839 | * pipapo_insert() - Insert new rule in field given input key and mask length | |
840 | * @f: Field containing lookup table | |
841 | * @k: Input key for classification, without nftables padding | |
842 | * @mask_bits: Length of mask; matches field length for non-ranged entry | |
843 | * | |
844 | * Insert a new rule reference in lookup buckets corresponding to k and | |
845 | * mask_bits. | |
846 | * | |
847 | * Return: 1 on success (one rule inserted), negative error code on failure. | |
848 | */ | |
849 | static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, | |
850 | int mask_bits) | |
851 | { | |
852 | int rule = f->rules++, group, ret; | |
853 | ||
854 | ret = pipapo_resize(f, f->rules - 1, f->rules); | |
855 | if (ret) | |
856 | return ret; | |
857 | ||
858 | for (group = 0; group < f->groups; group++) { | |
859 | int i, v; | |
860 | u8 mask; | |
861 | ||
862 | if (group % 2) | |
863 | v = k[group / 2] & 0x0f; | |
864 | else | |
865 | v = k[group / 2] >> 4; | |
866 | ||
867 | if (mask_bits >= (group + 1) * 4) { | |
868 | /* Not masked */ | |
869 | pipapo_bucket_set(f, rule, group, v); | |
870 | } else if (mask_bits <= group * 4) { | |
871 | /* Completely masked */ | |
872 | for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) | |
873 | pipapo_bucket_set(f, rule, group, i); | |
874 | } else { | |
875 | /* The mask limit falls on this group */ | |
876 | mask = 0x0f >> (mask_bits - group * 4); | |
877 | for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) { | |
878 | if ((i & ~mask) == (v & ~mask)) | |
879 | pipapo_bucket_set(f, rule, group, i); | |
880 | } | |
881 | } | |
882 | } | |
883 | ||
884 | return 1; | |
885 | } | |
886 | ||
887 | /** | |
888 | * pipapo_step_diff() - Check if setting @step bit in netmask would change it | |
889 | * @base: Mask we are expanding | |
890 | * @step: Step bit for given expansion step | |
891 | * @len: Total length of mask space (set and unset bits), bytes | |
892 | * | |
893 | * Convenience function for mask expansion. | |
894 | * | |
895 | * Return: true if step bit changes mask (i.e. isn't set), false otherwise. | |
896 | */ | |
897 | static bool pipapo_step_diff(u8 *base, int step, int len) | |
898 | { | |
899 | /* Network order, byte-addressed */ | |
900 | #ifdef __BIG_ENDIAN__ | |
901 | return !(BIT(step % BITS_PER_BYTE) & base[step / BITS_PER_BYTE]); | |
902 | #else | |
903 | return !(BIT(step % BITS_PER_BYTE) & | |
904 | base[len - 1 - step / BITS_PER_BYTE]); | |
905 | #endif | |
906 | } | |
907 | ||
908 | /** | |
909 | * pipapo_step_after_end() - Check if mask exceeds range end with given step | |
910 | * @base: Mask we are expanding | |
911 | * @end: End of range | |
912 | * @step: Step bit for given expansion step, highest bit to be set | |
913 | * @len: Total length of mask space (set and unset bits), bytes | |
914 | * | |
915 | * Convenience function for mask expansion. | |
916 | * | |
917 | * Return: true if mask exceeds range setting step bits, false otherwise. | |
918 | */ | |
919 | static bool pipapo_step_after_end(const u8 *base, const u8 *end, int step, | |
920 | int len) | |
921 | { | |
922 | u8 tmp[NFT_PIPAPO_MAX_BYTES]; | |
923 | int i; | |
924 | ||
925 | memcpy(tmp, base, len); | |
926 | ||
927 | /* Network order, byte-addressed */ | |
928 | for (i = 0; i <= step; i++) | |
929 | #ifdef __BIG_ENDIAN__ | |
930 | tmp[i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); | |
931 | #else | |
932 | tmp[len - 1 - i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); | |
933 | #endif | |
934 | ||
935 | return memcmp(tmp, end, len) > 0; | |
936 | } | |
937 | ||
938 | /** | |
939 | * pipapo_base_sum() - Sum step bit to given len-sized netmask base with carry | |
940 | * @base: Netmask base | |
941 | * @step: Step bit to sum | |
942 | * @len: Netmask length, bytes | |
943 | */ | |
944 | static void pipapo_base_sum(u8 *base, int step, int len) | |
945 | { | |
946 | bool carry = false; | |
947 | int i; | |
948 | ||
949 | /* Network order, byte-addressed */ | |
950 | #ifdef __BIG_ENDIAN__ | |
951 | for (i = step / BITS_PER_BYTE; i < len; i++) { | |
952 | #else | |
953 | for (i = len - 1 - step / BITS_PER_BYTE; i >= 0; i--) { | |
954 | #endif | |
955 | if (carry) | |
956 | base[i]++; | |
957 | else | |
958 | base[i] += 1 << (step % BITS_PER_BYTE); | |
959 | ||
960 | if (base[i]) | |
961 | break; | |
962 | ||
963 | carry = true; | |
964 | } | |
965 | } | |
966 | ||
967 | /** | |
968 | * pipapo_expand() - Expand to composing netmasks, insert into lookup table | |
969 | * @f: Field containing lookup table | |
970 | * @start: Start of range | |
971 | * @end: End of range | |
972 | * @len: Length of value in bits | |
973 | * | |
974 | * Expand range to composing netmasks and insert corresponding rule references | |
975 | * in lookup buckets. | |
976 | * | |
977 | * Return: number of inserted rules on success, negative error code on failure. | |
978 | */ | |
979 | static int pipapo_expand(struct nft_pipapo_field *f, | |
980 | const u8 *start, const u8 *end, int len) | |
981 | { | |
982 | int step, masks = 0, bytes = DIV_ROUND_UP(len, BITS_PER_BYTE); | |
983 | u8 base[NFT_PIPAPO_MAX_BYTES]; | |
984 | ||
985 | memcpy(base, start, bytes); | |
986 | while (memcmp(base, end, bytes) <= 0) { | |
987 | int err; | |
988 | ||
989 | step = 0; | |
990 | while (pipapo_step_diff(base, step, bytes)) { | |
991 | if (pipapo_step_after_end(base, end, step, bytes)) | |
992 | break; | |
993 | ||
994 | step++; | |
995 | if (step >= len) { | |
996 | if (!masks) { | |
997 | pipapo_insert(f, base, 0); | |
998 | masks = 1; | |
999 | } | |
1000 | goto out; | |
1001 | } | |
1002 | } | |
1003 | ||
1004 | err = pipapo_insert(f, base, len - step); | |
1005 | ||
1006 | if (err < 0) | |
1007 | return err; | |
1008 | ||
1009 | masks++; | |
1010 | pipapo_base_sum(base, step, bytes); | |
1011 | } | |
1012 | out: | |
1013 | return masks; | |
1014 | } | |
1015 | ||
1016 | /** | |
1017 | * pipapo_map() - Insert rules in mapping tables, mapping them between fields | |
1018 | * @m: Matching data, including mapping table | |
1019 | * @map: Table of rule maps: array of first rule and amount of rules | |
1020 | * in next field a given rule maps to, for each field | |
1021 | * @ext: For last field, nft_set_ext pointer matching rules map to | |
1022 | */ | |
1023 | static void pipapo_map(struct nft_pipapo_match *m, | |
1024 | union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS], | |
1025 | struct nft_pipapo_elem *e) | |
1026 | { | |
1027 | struct nft_pipapo_field *f; | |
1028 | int i, j; | |
1029 | ||
1030 | for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) { | |
1031 | for (j = 0; j < map[i].n; j++) { | |
1032 | f->mt[map[i].to + j].to = map[i + 1].to; | |
1033 | f->mt[map[i].to + j].n = map[i + 1].n; | |
1034 | } | |
1035 | } | |
1036 | ||
1037 | /* Last field: map to ext instead of mapping to next field */ | |
1038 | for (j = 0; j < map[i].n; j++) | |
1039 | f->mt[map[i].to + j].e = e; | |
1040 | } | |
1041 | ||
1042 | /** | |
1043 | * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results | |
1044 | * @clone: Copy of matching data with pending insertions and deletions | |
1045 | * @bsize_max Maximum bucket size, scratch maps cover two buckets | |
1046 | * | |
1047 | * Return: 0 on success, -ENOMEM on failure. | |
1048 | */ | |
1049 | static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, | |
1050 | unsigned long bsize_max) | |
1051 | { | |
1052 | int i; | |
1053 | ||
1054 | for_each_possible_cpu(i) { | |
1055 | unsigned long *scratch; | |
1056 | ||
1057 | scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2, | |
1058 | GFP_KERNEL, cpu_to_node(i)); | |
1059 | if (!scratch) { | |
1060 | /* On failure, there's no need to undo previous | |
1061 | * allocations: this means that some scratch maps have | |
1062 | * a bigger allocated size now (this is only called on | |
1063 | * insertion), but the extra space won't be used by any | |
1064 | * CPU as new elements are not inserted and m->bsize_max | |
1065 | * is not updated. | |
1066 | */ | |
1067 | return -ENOMEM; | |
1068 | } | |
1069 | ||
1070 | kfree(*per_cpu_ptr(clone->scratch, i)); | |
1071 | ||
1072 | *per_cpu_ptr(clone->scratch, i) = scratch; | |
1073 | } | |
1074 | ||
1075 | return 0; | |
1076 | } | |
1077 | ||
1078 | /** | |
1079 | * nft_pipapo_insert() - Validate and insert ranged elements | |
1080 | * @net: Network namespace | |
1081 | * @set: nftables API set representation | |
1082 | * @elem: nftables API element representation containing key data | |
1083 | * @ext2: Filled with pointer to &struct nft_set_ext in inserted element | |
1084 | * | |
1085 | * Return: 0 on success, error pointer on failure. | |
1086 | */ | |
1087 | static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, | |
1088 | const struct nft_set_elem *elem, | |
1089 | struct nft_set_ext **ext2) | |
1090 | { | |
1091 | const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); | |
1092 | union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; | |
1093 | const u8 *start = (const u8 *)elem->key.val.data, *end; | |
1094 | struct nft_pipapo_elem *e = elem->priv, *dup; | |
1095 | struct nft_pipapo *priv = nft_set_priv(set); | |
1096 | struct nft_pipapo_match *m = priv->clone; | |
1097 | u8 genmask = nft_genmask_next(net); | |
1098 | struct nft_pipapo_field *f; | |
1099 | int i, bsize_max, err = 0; | |
1100 | ||
1101 | dup = pipapo_get(net, set, start, genmask); | |
1102 | if (PTR_ERR(dup) == -ENOENT) { | |
1103 | if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) { | |
1104 | end = (const u8 *)nft_set_ext_key_end(ext)->data; | |
1105 | dup = pipapo_get(net, set, end, nft_genmask_next(net)); | |
1106 | } else { | |
1107 | end = start; | |
1108 | } | |
1109 | } | |
1110 | ||
1111 | if (PTR_ERR(dup) != -ENOENT) { | |
1112 | if (IS_ERR(dup)) | |
1113 | return PTR_ERR(dup); | |
1114 | *ext2 = &dup->ext; | |
1115 | return -EEXIST; | |
1116 | } | |
1117 | ||
1118 | /* Validate */ | |
1119 | nft_pipapo_for_each_field(f, i, m) { | |
1120 | const u8 *start_p = start, *end_p = end; | |
1121 | ||
1122 | if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX) | |
1123 | return -ENOSPC; | |
1124 | ||
1125 | if (memcmp(start_p, end_p, | |
1126 | f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) > 0) | |
1127 | return -EINVAL; | |
1128 | ||
1129 | start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); | |
1130 | end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); | |
1131 | } | |
1132 | ||
1133 | /* Insert */ | |
1134 | priv->dirty = true; | |
1135 | ||
1136 | bsize_max = m->bsize_max; | |
1137 | ||
1138 | nft_pipapo_for_each_field(f, i, m) { | |
1139 | int ret; | |
1140 | ||
1141 | rulemap[i].to = f->rules; | |
1142 | ||
1143 | ret = memcmp(start, end, | |
1144 | f->groups / NFT_PIPAPO_GROUPS_PER_BYTE); | |
1145 | if (!ret) { | |
1146 | ret = pipapo_insert(f, start, | |
1147 | f->groups * NFT_PIPAPO_GROUP_BITS); | |
1148 | } else { | |
1149 | ret = pipapo_expand(f, start, end, | |
1150 | f->groups * NFT_PIPAPO_GROUP_BITS); | |
1151 | } | |
1152 | ||
1153 | if (f->bsize > bsize_max) | |
1154 | bsize_max = f->bsize; | |
1155 | ||
1156 | rulemap[i].n = ret; | |
1157 | ||
1158 | start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); | |
1159 | end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); | |
1160 | } | |
1161 | ||
1162 | if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { | |
1163 | err = pipapo_realloc_scratch(m, bsize_max); | |
1164 | if (err) | |
1165 | return err; | |
1166 | ||
1167 | this_cpu_write(nft_pipapo_scratch_index, false); | |
1168 | ||
1169 | m->bsize_max = bsize_max; | |
1170 | } | |
1171 | ||
1172 | *ext2 = &e->ext; | |
1173 | ||
1174 | pipapo_map(m, rulemap, e); | |
1175 | ||
1176 | return 0; | |
1177 | } | |
1178 | ||
1179 | /** | |
1180 | * pipapo_clone() - Clone matching data to create new working copy | |
1181 | * @old: Existing matching data | |
1182 | * | |
1183 | * Return: copy of matching data passed as 'old', error pointer on failure | |
1184 | */ | |
1185 | static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) | |
1186 | { | |
1187 | struct nft_pipapo_field *dst, *src; | |
1188 | struct nft_pipapo_match *new; | |
1189 | int i; | |
1190 | ||
1191 | new = kmalloc(sizeof(*new) + sizeof(*dst) * old->field_count, | |
1192 | GFP_KERNEL); | |
1193 | if (!new) | |
1194 | return ERR_PTR(-ENOMEM); | |
1195 | ||
1196 | new->field_count = old->field_count; | |
1197 | new->bsize_max = old->bsize_max; | |
1198 | ||
1199 | new->scratch = alloc_percpu(*new->scratch); | |
1200 | if (!new->scratch) | |
1201 | goto out_scratch; | |
1202 | ||
1203 | rcu_head_init(&new->rcu); | |
1204 | ||
1205 | src = old->f; | |
1206 | dst = new->f; | |
1207 | ||
1208 | for (i = 0; i < old->field_count; i++) { | |
1209 | memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); | |
1210 | ||
1211 | dst->lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS * | |
1212 | src->bsize * sizeof(*dst->lt), | |
1213 | GFP_KERNEL); | |
1214 | if (!dst->lt) | |
1215 | goto out_lt; | |
1216 | ||
1217 | memcpy(dst->lt, src->lt, | |
1218 | src->bsize * sizeof(*dst->lt) * | |
1219 | src->groups * NFT_PIPAPO_BUCKETS); | |
1220 | ||
1221 | dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL); | |
1222 | if (!dst->mt) | |
1223 | goto out_mt; | |
1224 | ||
1225 | memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); | |
1226 | src++; | |
1227 | dst++; | |
1228 | } | |
1229 | ||
1230 | return new; | |
1231 | ||
1232 | out_mt: | |
1233 | kvfree(dst->lt); | |
1234 | out_lt: | |
1235 | for (dst--; i > 0; i--) { | |
1236 | kvfree(dst->mt); | |
1237 | kvfree(dst->lt); | |
1238 | dst--; | |
1239 | } | |
1240 | free_percpu(new->scratch); | |
1241 | out_scratch: | |
1242 | kfree(new); | |
1243 | ||
1244 | return ERR_PTR(-ENOMEM); | |
1245 | } | |
1246 | ||
1247 | /** | |
1248 | * pipapo_rules_same_key() - Get number of rules originated from the same entry | |
1249 | * @f: Field containing mapping table | |
1250 | * @first: Index of first rule in set of rules mapping to same entry | |
1251 | * | |
1252 | * Using the fact that all rules in a field that originated from the same entry | |
1253 | * will map to the same set of rules in the next field, or to the same element | |
1254 | * reference, return the cardinality of the set of rules that originated from | |
1255 | * the same entry as the rule with index @first, @first rule included. | |
1256 | * | |
1257 | * In pictures: | |
1258 | * rules | |
1259 | * field #0 0 1 2 3 4 | |
1260 | * map to: 0 1 2-4 2-4 5-9 | |
1261 | * . . ....... . ... | |
1262 | * | | | | \ \ | |
1263 | * | | | | \ \ | |
1264 | * | | | | \ \ | |
1265 | * ' ' ' ' ' \ | |
1266 | * in field #1 0 1 2 3 4 5 ... | |
1267 | * | |
1268 | * if this is called for rule 2 on field #0, it will return 3, as also rules 2 | |
1269 | * and 3 in field 0 map to the same set of rules (2, 3, 4) in the next field. | |
1270 | * | |
1271 | * For the last field in a set, we can rely on associated entries to map to the | |
1272 | * same element references. | |
1273 | * | |
1274 | * Return: Number of rules that originated from the same entry as @first. | |
1275 | */ | |
1276 | static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first) | |
1277 | { | |
1278 | struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */ | |
1279 | int r; | |
1280 | ||
1281 | for (r = first; r < f->rules; r++) { | |
1282 | if (r != first && e != f->mt[r].e) | |
1283 | return r - first; | |
1284 | ||
1285 | e = f->mt[r].e; | |
1286 | } | |
1287 | ||
1288 | if (r != first) | |
1289 | return r - first; | |
1290 | ||
1291 | return 0; | |
1292 | } | |
1293 | ||
1294 | /** | |
1295 | * pipapo_unmap() - Remove rules from mapping tables, renumber remaining ones | |
1296 | * @mt: Mapping array | |
1297 | * @rules: Original amount of rules in mapping table | |
1298 | * @start: First rule index to be removed | |
1299 | * @n: Amount of rules to be removed | |
1300 | * @to_offset: First rule index, in next field, this group of rules maps to | |
1301 | * @is_last: If this is the last field, delete reference from mapping array | |
1302 | * | |
1303 | * This is used to unmap rules from the mapping table for a single field, | |
1304 | * maintaining consistency and compactness for the existing ones. | |
1305 | * | |
1306 | * In pictures: let's assume that we want to delete rules 2 and 3 from the | |
1307 | * following mapping array: | |
1308 | * | |
1309 | * rules | |
1310 | * 0 1 2 3 4 | |
1311 | * map to: 4-10 4-10 11-15 11-15 16-18 | |
1312 | * | |
1313 | * the result will be: | |
1314 | * | |
1315 | * rules | |
1316 | * 0 1 2 | |
1317 | * map to: 4-10 4-10 11-13 | |
1318 | * | |
1319 | * for fields before the last one. In case this is the mapping table for the | |
1320 | * last field in a set, and rules map to pointers to &struct nft_pipapo_elem: | |
1321 | * | |
1322 | * rules | |
1323 | * 0 1 2 3 4 | |
1324 | * element pointers: 0x42 0x42 0x33 0x33 0x44 | |
1325 | * | |
1326 | * the result will be: | |
1327 | * | |
1328 | * rules | |
1329 | * 0 1 2 | |
1330 | * element pointers: 0x42 0x42 0x44 | |
1331 | */ | |
1332 | static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules, | |
1333 | int start, int n, int to_offset, bool is_last) | |
1334 | { | |
1335 | int i; | |
1336 | ||
1337 | memmove(mt + start, mt + start + n, (rules - start - n) * sizeof(*mt)); | |
1338 | memset(mt + rules - n, 0, n * sizeof(*mt)); | |
1339 | ||
1340 | if (is_last) | |
1341 | return; | |
1342 | ||
1343 | for (i = start; i < rules - n; i++) | |
1344 | mt[i].to -= to_offset; | |
1345 | } | |
1346 | ||
1347 | /** | |
1348 | * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map | |
1349 | * @m: Matching data | |
1350 | * @rulemap Table of rule maps, arrays of first rule and amount of rules | |
1351 | * in next field a given entry maps to, for each field | |
1352 | * | |
1353 | * For each rule in lookup table buckets mapping to this set of rules, drop | |
1354 | * all bits set in lookup table mapping. In pictures, assuming we want to drop | |
1355 | * rules 0 and 1 from this lookup table: | |
1356 | * | |
1357 | * bucket | |
1358 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
1359 | * 0 0 1,2 | |
1360 | * 1 1,2 0 | |
1361 | * 2 0 1,2 | |
1362 | * 3 0 1,2 | |
1363 | * 4 0,1,2 | |
1364 | * 5 0 1 2 | |
1365 | * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | |
1366 | * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 | |
1367 | * | |
1368 | * rule 2 becomes rule 0, and the result will be: | |
1369 | * | |
1370 | * bucket | |
1371 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
1372 | * 0 0 | |
1373 | * 1 0 | |
1374 | * 2 0 | |
1375 | * 3 0 | |
1376 | * 4 0 | |
1377 | * 5 0 | |
1378 | * 6 0 | |
1379 | * 7 0 0 | |
1380 | * | |
1381 | * once this is done, call unmap() to drop all the corresponding rule references | |
1382 | * from mapping tables. | |
1383 | */ | |
1384 | static void pipapo_drop(struct nft_pipapo_match *m, | |
1385 | union nft_pipapo_map_bucket rulemap[]) | |
1386 | { | |
1387 | struct nft_pipapo_field *f; | |
1388 | int i; | |
1389 | ||
1390 | nft_pipapo_for_each_field(f, i, m) { | |
1391 | int g; | |
1392 | ||
1393 | for (g = 0; g < f->groups; g++) { | |
1394 | unsigned long *pos; | |
1395 | int b; | |
1396 | ||
1397 | pos = f->lt + g * NFT_PIPAPO_BUCKETS * f->bsize; | |
1398 | ||
1399 | for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { | |
1400 | bitmap_cut(pos, pos, rulemap[i].to, | |
1401 | rulemap[i].n, | |
1402 | f->bsize * BITS_PER_LONG); | |
1403 | ||
1404 | pos += f->bsize; | |
1405 | } | |
1406 | } | |
1407 | ||
1408 | pipapo_unmap(f->mt, f->rules, rulemap[i].to, rulemap[i].n, | |
1409 | rulemap[i + 1].n, i == m->field_count - 1); | |
1410 | if (pipapo_resize(f, f->rules, f->rules - rulemap[i].n)) { | |
1411 | /* We can ignore this, a failure to shrink tables down | |
1412 | * doesn't make tables invalid. | |
1413 | */ | |
1414 | ; | |
1415 | } | |
1416 | f->rules -= rulemap[i].n; | |
1417 | } | |
1418 | } | |
1419 | ||
1420 | /** | |
1421 | * pipapo_gc() - Drop expired entries from set, destroy start and end elements | |
1422 | * @set: nftables API set representation | |
1423 | * @m: Matching data | |
1424 | */ | |
1425 | static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) | |
1426 | { | |
1427 | struct nft_pipapo *priv = nft_set_priv(set); | |
1428 | int rules_f0, first_rule = 0; | |
1429 | ||
1430 | while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { | |
1431 | union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; | |
1432 | struct nft_pipapo_field *f; | |
1433 | struct nft_pipapo_elem *e; | |
1434 | int i, start, rules_fx; | |
1435 | ||
1436 | start = first_rule; | |
1437 | rules_fx = rules_f0; | |
1438 | ||
1439 | nft_pipapo_for_each_field(f, i, m) { | |
1440 | rulemap[i].to = start; | |
1441 | rulemap[i].n = rules_fx; | |
1442 | ||
1443 | if (i < m->field_count - 1) { | |
1444 | rules_fx = f->mt[start].n; | |
1445 | start = f->mt[start].to; | |
1446 | } | |
1447 | } | |
1448 | ||
1449 | /* Pick the last field, and its last index */ | |
1450 | f--; | |
1451 | i--; | |
1452 | e = f->mt[rulemap[i].to].e; | |
1453 | if (nft_set_elem_expired(&e->ext) && | |
1454 | !nft_set_elem_mark_busy(&e->ext)) { | |
1455 | priv->dirty = true; | |
1456 | pipapo_drop(m, rulemap); | |
1457 | ||
1458 | rcu_barrier(); | |
1459 | nft_set_elem_destroy(set, e, true); | |
1460 | ||
1461 | /* And check again current first rule, which is now the | |
1462 | * first we haven't checked. | |
1463 | */ | |
1464 | } else { | |
1465 | first_rule += rules_f0; | |
1466 | } | |
1467 | } | |
1468 | ||
1469 | priv->last_gc = jiffies; | |
1470 | } | |
1471 | ||
1472 | /** | |
1473 | * pipapo_free_fields() - Free per-field tables contained in matching data | |
1474 | * @m: Matching data | |
1475 | */ | |
1476 | static void pipapo_free_fields(struct nft_pipapo_match *m) | |
1477 | { | |
1478 | struct nft_pipapo_field *f; | |
1479 | int i; | |
1480 | ||
1481 | nft_pipapo_for_each_field(f, i, m) { | |
1482 | kvfree(f->lt); | |
1483 | kvfree(f->mt); | |
1484 | } | |
1485 | } | |
1486 | ||
1487 | /** | |
1488 | * pipapo_reclaim_match - RCU callback to free fields from old matching data | |
1489 | * @rcu: RCU head | |
1490 | */ | |
1491 | static void pipapo_reclaim_match(struct rcu_head *rcu) | |
1492 | { | |
1493 | struct nft_pipapo_match *m; | |
1494 | int i; | |
1495 | ||
1496 | m = container_of(rcu, struct nft_pipapo_match, rcu); | |
1497 | ||
1498 | for_each_possible_cpu(i) | |
1499 | kfree(*per_cpu_ptr(m->scratch, i)); | |
1500 | ||
1501 | free_percpu(m->scratch); | |
1502 | ||
1503 | pipapo_free_fields(m); | |
1504 | ||
1505 | kfree(m); | |
1506 | } | |
1507 | ||
1508 | /** | |
1509 | * pipapo_commit() - Replace lookup data with current working copy | |
1510 | * @set: nftables API set representation | |
1511 | * | |
1512 | * While at it, check if we should perform garbage collection on the working | |
1513 | * copy before committing it for lookup, and don't replace the table if the | |
1514 | * working copy doesn't have pending changes. | |
1515 | * | |
1516 | * We also need to create a new working copy for subsequent insertions and | |
1517 | * deletions. | |
1518 | */ | |
1519 | static void pipapo_commit(const struct nft_set *set) | |
1520 | { | |
1521 | struct nft_pipapo *priv = nft_set_priv(set); | |
1522 | struct nft_pipapo_match *new_clone, *old; | |
1523 | ||
1524 | if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) | |
1525 | pipapo_gc(set, priv->clone); | |
1526 | ||
1527 | if (!priv->dirty) | |
1528 | return; | |
1529 | ||
1530 | new_clone = pipapo_clone(priv->clone); | |
1531 | if (IS_ERR(new_clone)) | |
1532 | return; | |
1533 | ||
1534 | priv->dirty = false; | |
1535 | ||
1536 | old = rcu_access_pointer(priv->match); | |
1537 | rcu_assign_pointer(priv->match, priv->clone); | |
1538 | if (old) | |
1539 | call_rcu(&old->rcu, pipapo_reclaim_match); | |
1540 | ||
1541 | priv->clone = new_clone; | |
1542 | } | |
1543 | ||
1544 | /** | |
1545 | * nft_pipapo_activate() - Mark element reference as active given key, commit | |
1546 | * @net: Network namespace | |
1547 | * @set: nftables API set representation | |
1548 | * @elem: nftables API element representation containing key data | |
1549 | * | |
1550 | * On insertion, elements are added to a copy of the matching data currently | |
1551 | * in use for lookups, and not directly inserted into current lookup data, so | |
1552 | * we'll take care of that by calling pipapo_commit() here. Both | |
1553 | * nft_pipapo_insert() and nft_pipapo_activate() are called once for each | |
1554 | * element, hence we can't purpose either one as a real commit operation. | |
1555 | */ | |
1556 | static void nft_pipapo_activate(const struct net *net, | |
1557 | const struct nft_set *set, | |
1558 | const struct nft_set_elem *elem) | |
1559 | { | |
1560 | struct nft_pipapo_elem *e; | |
1561 | ||
1562 | e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); | |
1563 | if (IS_ERR(e)) | |
1564 | return; | |
1565 | ||
1566 | nft_set_elem_change_active(net, set, &e->ext); | |
1567 | nft_set_elem_clear_busy(&e->ext); | |
1568 | ||
1569 | pipapo_commit(set); | |
1570 | } | |
1571 | ||
1572 | /** | |
1573 | * pipapo_deactivate() - Check that element is in set, mark as inactive | |
1574 | * @net: Network namespace | |
1575 | * @set: nftables API set representation | |
1576 | * @data: Input key data | |
1577 | * @ext: nftables API extension pointer, used to check for end element | |
1578 | * | |
1579 | * This is a convenience function that can be called from both | |
1580 | * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same | |
1581 | * operation. | |
1582 | * | |
1583 | * Return: deactivated element if found, NULL otherwise. | |
1584 | */ | |
1585 | static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, | |
1586 | const u8 *data, const struct nft_set_ext *ext) | |
1587 | { | |
1588 | struct nft_pipapo_elem *e; | |
1589 | ||
1590 | e = pipapo_get(net, set, data, nft_genmask_next(net)); | |
1591 | if (IS_ERR(e)) | |
1592 | return NULL; | |
1593 | ||
1594 | nft_set_elem_change_active(net, set, &e->ext); | |
1595 | ||
1596 | return e; | |
1597 | } | |
1598 | ||
1599 | /** | |
1600 | * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive | |
1601 | * @net: Network namespace | |
1602 | * @set: nftables API set representation | |
1603 | * @elem: nftables API element representation containing key data | |
1604 | * | |
1605 | * Return: deactivated element if found, NULL otherwise. | |
1606 | */ | |
1607 | static void *nft_pipapo_deactivate(const struct net *net, | |
1608 | const struct nft_set *set, | |
1609 | const struct nft_set_elem *elem) | |
1610 | { | |
1611 | const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); | |
1612 | ||
1613 | return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext); | |
1614 | } | |
1615 | ||
1616 | /** | |
1617 | * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive | |
1618 | * @net: Network namespace | |
1619 | * @set: nftables API set representation | |
1620 | * @elem: nftables API element representation containing key data | |
1621 | * | |
1622 | * This is functionally the same as nft_pipapo_deactivate(), with a slightly | |
1623 | * different interface, and it's also called once for each element in a set | |
1624 | * being flushed, so we can't implement, strictly speaking, a flush operation, | |
1625 | * which would otherwise be as simple as allocating an empty copy of the | |
1626 | * matching data. | |
1627 | * | |
1628 | * Note that we could in theory do that, mark the set as flushed, and ignore | |
1629 | * subsequent calls, but we would leak all the elements after the first one, | |
1630 | * because they wouldn't then be freed as result of API calls. | |
1631 | * | |
1632 | * Return: true if element was found and deactivated. | |
1633 | */ | |
1634 | static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set, | |
1635 | void *elem) | |
1636 | { | |
1637 | struct nft_pipapo_elem *e = elem; | |
1638 | ||
1639 | return pipapo_deactivate(net, set, (const u8 *)nft_set_ext_key(&e->ext), | |
1640 | &e->ext); | |
1641 | } | |
1642 | ||
1643 | /** | |
1644 | * pipapo_get_boundaries() - Get byte interval for associated rules | |
1645 | * @f: Field including lookup table | |
1646 | * @first_rule: First rule (lowest index) | |
1647 | * @rule_count: Number of associated rules | |
1648 | * @left: Byte expression for left boundary (start of range) | |
1649 | * @right: Byte expression for right boundary (end of range) | |
1650 | * | |
1651 | * Given the first rule and amount of rules that originated from the same entry, | |
1652 | * build the original range associated with the entry, and calculate the length | |
1653 | * of the originating netmask. | |
1654 | * | |
1655 | * In pictures: | |
1656 | * | |
1657 | * bucket | |
1658 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
1659 | * 0 1,2 | |
1660 | * 1 1,2 | |
1661 | * 2 1,2 | |
1662 | * 3 1,2 | |
1663 | * 4 1,2 | |
1664 | * 5 1 2 | |
1665 | * 6 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | |
1666 | * 7 1,2 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | |
1667 | * | |
1668 | * this is the lookup table corresponding to the IPv4 range | |
1669 | * 192.168.1.0-192.168.2.1, which was expanded to the two composing netmasks, | |
1670 | * rule #1: 192.168.1.0/24, and rule #2: 192.168.2.0/31. | |
1671 | * | |
1672 | * This function fills @left and @right with the byte values of the leftmost | |
1673 | * and rightmost bucket indices for the lowest and highest rule indices, | |
1674 | * respectively. If @first_rule is 1 and @rule_count is 2, we obtain, in | |
1675 | * nibbles: | |
1676 | * left: < 12, 0, 10, 8, 0, 1, 0, 0 > | |
1677 | * right: < 12, 0, 10, 8, 0, 2, 2, 1 > | |
1678 | * corresponding to bytes: | |
1679 | * left: < 192, 168, 1, 0 > | |
1680 | * right: < 192, 168, 2, 1 > | |
1681 | * with mask length irrelevant here, unused on return, as the range is already | |
1682 | * defined by its start and end points. The mask length is relevant for a single | |
1683 | * ranged entry instead: if @first_rule is 1 and @rule_count is 1, we ignore | |
1684 | * rule 2 above: @left becomes < 192, 168, 1, 0 >, @right becomes | |
1685 | * < 192, 168, 1, 255 >, and the mask length, calculated from the distances | |
1686 | * between leftmost and rightmost bucket indices for each group, would be 24. | |
1687 | * | |
1688 | * Return: mask length, in bits. | |
1689 | */ | |
1690 | static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule, | |
1691 | int rule_count, u8 *left, u8 *right) | |
1692 | { | |
1693 | u8 *l = left, *r = right; | |
1694 | int g, mask_len = 0; | |
1695 | ||
1696 | for (g = 0; g < f->groups; g++) { | |
1697 | int b, x0, x1; | |
1698 | ||
1699 | x0 = -1; | |
1700 | x1 = -1; | |
1701 | for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { | |
1702 | unsigned long *pos; | |
1703 | ||
1704 | pos = f->lt + (g * NFT_PIPAPO_BUCKETS + b) * f->bsize; | |
1705 | if (test_bit(first_rule, pos) && x0 == -1) | |
1706 | x0 = b; | |
1707 | if (test_bit(first_rule + rule_count - 1, pos)) | |
1708 | x1 = b; | |
1709 | } | |
1710 | ||
1711 | if (g % 2) { | |
1712 | *(l++) |= x0 & 0x0f; | |
1713 | *(r++) |= x1 & 0x0f; | |
1714 | } else { | |
1715 | *l |= x0 << 4; | |
1716 | *r |= x1 << 4; | |
1717 | } | |
1718 | ||
1719 | if (x1 - x0 == 0) | |
1720 | mask_len += 4; | |
1721 | else if (x1 - x0 == 1) | |
1722 | mask_len += 3; | |
1723 | else if (x1 - x0 == 3) | |
1724 | mask_len += 2; | |
1725 | else if (x1 - x0 == 7) | |
1726 | mask_len += 1; | |
1727 | } | |
1728 | ||
1729 | return mask_len; | |
1730 | } | |
1731 | ||
1732 | /** | |
1733 | * pipapo_match_field() - Match rules against byte ranges | |
1734 | * @f: Field including the lookup table | |
1735 | * @first_rule: First of associated rules originating from same entry | |
1736 | * @rule_count: Amount of associated rules | |
1737 | * @start: Start of range to be matched | |
1738 | * @end: End of range to be matched | |
1739 | * | |
1740 | * Return: true on match, false otherwise. | |
1741 | */ | |
1742 | static bool pipapo_match_field(struct nft_pipapo_field *f, | |
1743 | int first_rule, int rule_count, | |
1744 | const u8 *start, const u8 *end) | |
1745 | { | |
1746 | u8 right[NFT_PIPAPO_MAX_BYTES] = { 0 }; | |
1747 | u8 left[NFT_PIPAPO_MAX_BYTES] = { 0 }; | |
1748 | ||
1749 | pipapo_get_boundaries(f, first_rule, rule_count, left, right); | |
1750 | ||
1751 | return !memcmp(start, left, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) && | |
1752 | !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE); | |
1753 | } | |
1754 | ||
1755 | /** | |
1756 | * nft_pipapo_remove() - Remove element given key, commit | |
1757 | * @net: Network namespace | |
1758 | * @set: nftables API set representation | |
1759 | * @elem: nftables API element representation containing key data | |
1760 | * | |
1761 | * Similarly to nft_pipapo_activate(), this is used as commit operation by the | |
1762 | * API, but it's called once per element in the pending transaction, so we can't | |
1763 | * implement this as a single commit operation. Closest we can get is to remove | |
1764 | * the matched element here, if any, and commit the updated matching data. | |
1765 | */ | |
1766 | static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, | |
1767 | const struct nft_set_elem *elem) | |
1768 | { | |
1769 | const u8 *data = (const u8 *)elem->key.val.data; | |
1770 | struct nft_pipapo *priv = nft_set_priv(set); | |
1771 | struct nft_pipapo_match *m = priv->clone; | |
1772 | int rules_f0, first_rule = 0; | |
1773 | struct nft_pipapo_elem *e; | |
1774 | ||
1775 | e = pipapo_get(net, set, data, 0); | |
1776 | if (IS_ERR(e)) | |
1777 | return; | |
1778 | ||
1779 | while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { | |
1780 | union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; | |
1781 | const u8 *match_start, *match_end; | |
1782 | struct nft_pipapo_field *f; | |
1783 | int i, start, rules_fx; | |
1784 | ||
1785 | match_start = data; | |
1786 | match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; | |
1787 | ||
1788 | start = first_rule; | |
1789 | rules_fx = rules_f0; | |
1790 | ||
1791 | nft_pipapo_for_each_field(f, i, m) { | |
1792 | if (!pipapo_match_field(f, start, rules_fx, | |
1793 | match_start, match_end)) | |
1794 | break; | |
1795 | ||
1796 | rulemap[i].to = start; | |
1797 | rulemap[i].n = rules_fx; | |
1798 | ||
1799 | rules_fx = f->mt[start].n; | |
1800 | start = f->mt[start].to; | |
1801 | ||
1802 | match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); | |
1803 | match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); | |
1804 | } | |
1805 | ||
1806 | if (i == m->field_count) { | |
1807 | priv->dirty = true; | |
1808 | pipapo_drop(m, rulemap); | |
1809 | pipapo_commit(set); | |
1810 | return; | |
1811 | } | |
1812 | ||
1813 | first_rule += rules_f0; | |
1814 | } | |
1815 | } | |
1816 | ||
1817 | /** | |
1818 | * nft_pipapo_walk() - Walk over elements | |
1819 | * @ctx: nftables API context | |
1820 | * @set: nftables API set representation | |
1821 | * @iter: Iterator | |
1822 | * | |
1823 | * As elements are referenced in the mapping array for the last field, directly | |
1824 | * scan that array: there's no need to follow rule mappings from the first | |
1825 | * field. | |
1826 | */ | |
1827 | static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, | |
1828 | struct nft_set_iter *iter) | |
1829 | { | |
1830 | struct nft_pipapo *priv = nft_set_priv(set); | |
1831 | struct nft_pipapo_match *m; | |
1832 | struct nft_pipapo_field *f; | |
1833 | int i, r; | |
1834 | ||
1835 | rcu_read_lock(); | |
1836 | m = rcu_dereference(priv->match); | |
1837 | ||
1838 | if (unlikely(!m)) | |
1839 | goto out; | |
1840 | ||
1841 | for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) | |
1842 | ; | |
1843 | ||
1844 | for (r = 0; r < f->rules; r++) { | |
1845 | struct nft_pipapo_elem *e; | |
1846 | struct nft_set_elem elem; | |
1847 | ||
1848 | if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) | |
1849 | continue; | |
1850 | ||
1851 | if (iter->count < iter->skip) | |
1852 | goto cont; | |
1853 | ||
1854 | e = f->mt[r].e; | |
1855 | if (nft_set_elem_expired(&e->ext)) | |
1856 | goto cont; | |
1857 | ||
1858 | elem.priv = e; | |
1859 | ||
1860 | iter->err = iter->fn(ctx, set, iter, &elem); | |
1861 | if (iter->err < 0) | |
1862 | goto out; | |
1863 | ||
1864 | cont: | |
1865 | iter->count++; | |
1866 | } | |
1867 | ||
1868 | out: | |
1869 | rcu_read_unlock(); | |
1870 | } | |
1871 | ||
1872 | /** | |
1873 | * nft_pipapo_privsize() - Return the size of private data for the set | |
1874 | * @nla: netlink attributes, ignored as size doesn't depend on them | |
1875 | * @desc: Set description, ignored as size doesn't depend on it | |
1876 | * | |
1877 | * Return: size of private data for this set implementation, in bytes | |
1878 | */ | |
1879 | static u64 nft_pipapo_privsize(const struct nlattr * const nla[], | |
1880 | const struct nft_set_desc *desc) | |
1881 | { | |
1882 | return sizeof(struct nft_pipapo); | |
1883 | } | |
1884 | ||
1885 | /** | |
1886 | * nft_pipapo_estimate() - Estimate set size, space and lookup complexity | |
1887 | * @desc: Set description, element count and field description used here | |
1888 | * @features: Flags: NFT_SET_INTERVAL needs to be there | |
1889 | * @est: Storage for estimation data | |
1890 | * | |
1891 | * The size for this set type can vary dramatically, as it depends on the number | |
1892 | * of rules (composing netmasks) the entries expand to. We compute the worst | |
1893 | * case here. | |
1894 | * | |
1895 | * In general, for a non-ranged entry or a single composing netmask, we need | |
1896 | * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that | |
1897 | * is, each input bit needs four bits of matching data), plus a bucket in the | |
1898 | * mapping table for each field. | |
1899 | * | |
1900 | * Return: true only for compatible range concatenations | |
1901 | */ | |
1902 | static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features, | |
1903 | struct nft_set_estimate *est) | |
1904 | { | |
1905 | unsigned long entry_size; | |
1906 | int i; | |
1907 | ||
1908 | if (!(features & NFT_SET_INTERVAL) || desc->field_count <= 1) | |
1909 | return false; | |
1910 | ||
1911 | for (i = 0, entry_size = 0; i < desc->field_count; i++) { | |
1912 | unsigned long rules; | |
1913 | ||
1914 | if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES) | |
1915 | return false; | |
1916 | ||
1917 | /* Worst-case ranges for each concatenated field: each n-bit | |
1918 | * field can expand to up to n * 2 rules in each bucket, and | |
1919 | * each rule also needs a mapping bucket. | |
1920 | */ | |
1921 | rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2; | |
1922 | entry_size += rules * NFT_PIPAPO_BUCKETS / BITS_PER_BYTE; | |
1923 | entry_size += rules * sizeof(union nft_pipapo_map_bucket); | |
1924 | } | |
1925 | ||
1926 | /* Rules in lookup and mapping tables are needed for each entry */ | |
1927 | est->size = desc->size * entry_size; | |
1928 | if (est->size && div_u64(est->size, desc->size) != entry_size) | |
1929 | return false; | |
1930 | ||
1931 | est->size += sizeof(struct nft_pipapo) + | |
1932 | sizeof(struct nft_pipapo_match) * 2; | |
1933 | ||
1934 | est->size += sizeof(struct nft_pipapo_field) * desc->field_count; | |
1935 | ||
1936 | est->lookup = NFT_SET_CLASS_O_LOG_N; | |
1937 | ||
1938 | est->space = NFT_SET_CLASS_O_N; | |
1939 | ||
1940 | return true; | |
1941 | } | |
1942 | ||
1943 | /** | |
1944 | * nft_pipapo_init() - Initialise data for a set instance | |
1945 | * @set: nftables API set representation | |
1946 | * @desc: Set description | |
1947 | * @nla: netlink attributes | |
1948 | * | |
1949 | * Validate number and size of fields passed as NFTA_SET_DESC_CONCAT netlink | |
1950 | * attributes, initialise internal set parameters, current instance of matching | |
1951 | * data and a copy for subsequent insertions. | |
1952 | * | |
1953 | * Return: 0 on success, negative error code on failure. | |
1954 | */ | |
1955 | static int nft_pipapo_init(const struct nft_set *set, | |
1956 | const struct nft_set_desc *desc, | |
1957 | const struct nlattr * const nla[]) | |
1958 | { | |
1959 | struct nft_pipapo *priv = nft_set_priv(set); | |
1960 | struct nft_pipapo_match *m; | |
1961 | struct nft_pipapo_field *f; | |
1962 | int err, i; | |
1963 | ||
1964 | if (desc->field_count > NFT_PIPAPO_MAX_FIELDS) | |
1965 | return -EINVAL; | |
1966 | ||
1967 | m = kmalloc(sizeof(*priv->match) + sizeof(*f) * desc->field_count, | |
1968 | GFP_KERNEL); | |
1969 | if (!m) | |
1970 | return -ENOMEM; | |
1971 | ||
1972 | m->field_count = desc->field_count; | |
1973 | m->bsize_max = 0; | |
1974 | ||
1975 | m->scratch = alloc_percpu(unsigned long *); | |
1976 | if (!m->scratch) { | |
1977 | err = -ENOMEM; | |
1978 | goto out_free; | |
1979 | } | |
1980 | for_each_possible_cpu(i) | |
1981 | *per_cpu_ptr(m->scratch, i) = NULL; | |
1982 | ||
1983 | rcu_head_init(&m->rcu); | |
1984 | ||
1985 | nft_pipapo_for_each_field(f, i, m) { | |
1986 | f->groups = desc->field_len[i] * NFT_PIPAPO_GROUPS_PER_BYTE; | |
1987 | priv->groups += f->groups; | |
1988 | ||
1989 | priv->width += round_up(desc->field_len[i], sizeof(u32)); | |
1990 | ||
1991 | f->bsize = 0; | |
1992 | f->rules = 0; | |
1993 | f->lt = NULL; | |
1994 | f->mt = NULL; | |
1995 | } | |
1996 | ||
1997 | /* Create an initial clone of matching data for next insertion */ | |
1998 | priv->clone = pipapo_clone(m); | |
1999 | if (IS_ERR(priv->clone)) { | |
2000 | err = PTR_ERR(priv->clone); | |
2001 | goto out_free; | |
2002 | } | |
2003 | ||
2004 | priv->dirty = false; | |
2005 | ||
2006 | rcu_assign_pointer(priv->match, m); | |
2007 | ||
2008 | return 0; | |
2009 | ||
2010 | out_free: | |
2011 | free_percpu(m->scratch); | |
2012 | kfree(m); | |
2013 | ||
2014 | return err; | |
2015 | } | |
2016 | ||
2017 | /** | |
2018 | * nft_pipapo_destroy() - Free private data for set and all committed elements | |
2019 | * @set: nftables API set representation | |
2020 | */ | |
2021 | static void nft_pipapo_destroy(const struct nft_set *set) | |
2022 | { | |
2023 | struct nft_pipapo *priv = nft_set_priv(set); | |
2024 | struct nft_pipapo_match *m; | |
2025 | struct nft_pipapo_field *f; | |
2026 | int i, r, cpu; | |
2027 | ||
2028 | m = rcu_dereference_protected(priv->match, true); | |
2029 | if (m) { | |
2030 | rcu_barrier(); | |
2031 | ||
2032 | for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) | |
2033 | ; | |
2034 | ||
2035 | for (r = 0; r < f->rules; r++) { | |
2036 | struct nft_pipapo_elem *e; | |
2037 | ||
2038 | if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) | |
2039 | continue; | |
2040 | ||
2041 | e = f->mt[r].e; | |
2042 | ||
2043 | nft_set_elem_destroy(set, e, true); | |
2044 | } | |
2045 | ||
2046 | for_each_possible_cpu(cpu) | |
2047 | kfree(*per_cpu_ptr(m->scratch, cpu)); | |
2048 | free_percpu(m->scratch); | |
2049 | ||
2050 | pipapo_free_fields(m); | |
2051 | kfree(m); | |
2052 | priv->match = NULL; | |
2053 | } | |
2054 | ||
2055 | if (priv->clone) { | |
2056 | for_each_possible_cpu(cpu) | |
2057 | kfree(*per_cpu_ptr(priv->clone->scratch, cpu)); | |
2058 | free_percpu(priv->clone->scratch); | |
2059 | ||
2060 | pipapo_free_fields(priv->clone); | |
2061 | kfree(priv->clone); | |
2062 | priv->clone = NULL; | |
2063 | } | |
2064 | } | |
2065 | ||
2066 | /** | |
2067 | * nft_pipapo_gc_init() - Initialise garbage collection | |
2068 | * @set: nftables API set representation | |
2069 | * | |
2070 | * Instead of actually setting up a periodic work for garbage collection, as | |
2071 | * this operation requires a swap of matching data with the working copy, we'll | |
2072 | * do that opportunistically with other commit operations if the interval is | |
2073 | * elapsed, so we just need to set the current jiffies timestamp here. | |
2074 | */ | |
2075 | static void nft_pipapo_gc_init(const struct nft_set *set) | |
2076 | { | |
2077 | struct nft_pipapo *priv = nft_set_priv(set); | |
2078 | ||
2079 | priv->last_gc = jiffies; | |
2080 | } | |
2081 | ||
2082 | struct nft_set_type nft_set_pipapo_type __read_mostly = { | |
2083 | .owner = THIS_MODULE, | |
2084 | .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | | |
2085 | NFT_SET_TIMEOUT, | |
2086 | .ops = { | |
2087 | .lookup = nft_pipapo_lookup, | |
2088 | .insert = nft_pipapo_insert, | |
2089 | .activate = nft_pipapo_activate, | |
2090 | .deactivate = nft_pipapo_deactivate, | |
2091 | .flush = nft_pipapo_flush, | |
2092 | .remove = nft_pipapo_remove, | |
2093 | .walk = nft_pipapo_walk, | |
2094 | .get = nft_pipapo_get, | |
2095 | .privsize = nft_pipapo_privsize, | |
2096 | .estimate = nft_pipapo_estimate, | |
2097 | .init = nft_pipapo_init, | |
2098 | .destroy = nft_pipapo_destroy, | |
2099 | .gc_init = nft_pipapo_gc_init, | |
2100 | .elemsize = offsetof(struct nft_pipapo_elem, ext), | |
2101 | }, | |
2102 | }; |