Merge tag 'hyperv-next-signed-20201214' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-block.git] / net / netfilter / nft_set_pipapo_avx2.c
CommitLineData
7400b063
SB
1// SPDX-License-Identifier: GPL-2.0-only
2
3/* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines
4 *
5 * Copyright (c) 2019-2020 Red Hat GmbH
6 *
7 * Author: Stefano Brivio <sbrivio@redhat.com>
8 */
9
10#include <linux/kernel.h>
11#include <linux/init.h>
12#include <linux/module.h>
13#include <linux/netlink.h>
14#include <linux/netfilter.h>
15#include <linux/netfilter/nf_tables.h>
16#include <net/netfilter/nf_tables_core.h>
17#include <uapi/linux/netfilter/nf_tables.h>
18#include <linux/bitmap.h>
19#include <linux/bitops.h>
20
21#include <linux/compiler.h>
22#include <asm/fpu/api.h>
23
24#include "nft_set_pipapo_avx2.h"
25#include "nft_set_pipapo.h"
26
/* Count of unsigned longs making up one unit of XSAVE_YMM_SIZE bits */
#define NFT_PIPAPO_LONGS_PER_M256	(XSAVE_YMM_SIZE / BITS_PER_LONG)

/* "Stream load" from memory @loc into YMM register number @reg, i.e. a load
 * carrying a non-temporal hint, so that lines are not pulled into the cache
 * and packet data is not evicted from it. This fits two situations:
 *
 * - lookup table buckets, which won't be needed again until the packet has
 *   been classified entirely
 *
 * - the result bitmap of the previous field, which is consumed exactly once
 */
#define NFT_PIPAPO_AVX2_LOAD(reg, loc)					\
	asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc))

/* Stream one lookup table bucket into YMM register @reg. The bucket is
 * selected from lookup table @lt by group index @group and packet bits value
 * @v, with buckets being @bsize longs each and groups being @bb bits wide.
 */
#define NFT_PIPAPO_AVX2_BUCKET_LOAD(reg, lt, group, v, bsize, bb)	\
	NFT_PIPAPO_AVX2_LOAD(reg,					\
			     lt[((group) * NFT_PIPAPO_BUCKETS(bb) +	\
				 (v)) * (bsize)])

/* Bucket load for 4-bit groups */
#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize)		\
	NFT_PIPAPO_AVX2_BUCKET_LOAD(reg, lt, group, v, bsize, 4)

/* Bucket load for 8-bit groups */
#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize)		\
	NFT_PIPAPO_AVX2_BUCKET_LOAD(reg, lt, group, v, bsize, 8)

/* Bitwise AND of YMM registers @a and @b into @dst: the core operation of
 * the algorithm
 */
#define NFT_PIPAPO_AVX2_AND(dst, a, b)					\
	asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst)

/* Branch to @label if YMM register @reg holds all zeroes */
#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label)			\
	asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";"	\
			  "je %l[" #label "]" : : : : label)

/* Write the 256 bits of YMM register @reg to memory @loc. Unlike bucket
 * loads, this goes through the cache: stored match results are read back
 * shortly afterwards.
 */
#define NFT_PIPAPO_AVX2_STORE(loc, reg)					\
	asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))

/* Clear YMM register @reg entirely */
#define NFT_PIPAPO_AVX2_ZERO(reg)					\
	asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)

/* Index of the current working bitmap, toggled between field matches */
static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index);
76
/**
 * nft_pipapo_avx2_prepare() - Set up register state for the lookup routines
 *
 * Clears ymm15. The lookup routines store ymm15 to memory whenever a memory
 * location has to be zeroed, so it must hold zeroes before they run.
 */
static void nft_pipapo_avx2_prepare(void)
{
	NFT_PIPAPO_AVX2_ZERO(15);
}
87
88/**
89 * nft_pipapo_avx2_fill() - Fill a bitmap region with ones
90 * @data: Base memory area
91 * @start: First bit to set
92 * @len: Count of bits to fill
93 *
94 * This is nothing else than a version of bitmap_set(), as used e.g. by
95 * pipapo_refill(), tailored for the microarchitectures using it and better
96 * suited for the specific usage: it's very likely that we'll set a small number
97 * of bits, not crossing a word boundary, and correct branch prediction is
98 * critical here.
99 *
100 * This function doesn't actually use any AVX2 instruction.
101 */
102static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len)
103{
104 int offset = start % BITS_PER_LONG;
105 unsigned long mask;
106
107 data += start / BITS_PER_LONG;
108
109 if (likely(len == 1)) {
110 *data |= BIT(offset);
111 return;
112 }
113
114 if (likely(len < BITS_PER_LONG || offset)) {
115 if (likely(len + offset <= BITS_PER_LONG)) {
116 *data |= GENMASK(len - 1 + offset, offset);
117 return;
118 }
119
120 *data |= ~0UL << offset;
121 len -= BITS_PER_LONG - offset;
122 data++;
123
124 if (len <= BITS_PER_LONG) {
125 mask = ~0UL >> (BITS_PER_LONG - len);
126 *data |= mask;
127 return;
128 }
129 }
130
131 memset(data, 0xff, len / BITS_PER_BYTE);
132 data += len / BITS_PER_LONG;
133
134 len %= BITS_PER_LONG;
135 if (len)
136 *data |= ~0UL >> (BITS_PER_LONG - len);
137}
138
139/**
140 * nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits
141 * @offset: Start from given bitmap (equivalent to bucket) offset, in longs
142 * @map: Bitmap to be scanned for set bits
143 * @dst: Destination bitmap
144 * @mt: Mapping table containing bit set specifiers
145 * @len: Length of bitmap in longs
146 * @last: Return index of first set bit, if this is the last field
147 *
148 * This is an alternative implementation of pipapo_refill() suitable for usage
149 * with AVX2 lookup routines: we know there are four words to be scanned, at
150 * a given offset inside the map, for each matching iteration.
151 *
152 * This function doesn't actually use any AVX2 instruction.
153 *
154 * Return: first set bit index if @last, index of first filled word otherwise.
155 */
156static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
157 unsigned long *dst,
158 union nft_pipapo_map_bucket *mt, bool last)
159{
160 int ret = -1;
161
162#define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x) \
163 do { \
164 while (map[(x)]) { \
165 int r = __builtin_ctzl(map[(x)]); \
166 int i = (offset + (x)) * BITS_PER_LONG + r; \
167 \
168 if (last) \
169 return i; \
170 \
171 nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n); \
172 \
173 if (ret == -1) \
174 ret = mt[i].to; \
175 \
176 map[(x)] &= ~(1UL << r); \
177 } \
178 } while (0)
179
180 NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0);
181 NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1);
182 NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2);
183 NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3);
184#undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD
185
186 return ret;
187}
188
189/**
190 * nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups
191 * @map: Previous match result, used as initial bitmap
192 * @fill: Destination bitmap to be filled with current match result
193 * @f: Field, containing lookup and mapping tables
194 * @offset: Ignore buckets before the given index, no bits are filled there
195 * @pkt: Packet data, pointer to input nftables register
196 * @first: If this is the first field, don't source previous result
197 * @last: Last field: stop at the first match and return bit index
198 *
199 * Load buckets from lookup table corresponding to the values of each 4-bit
200 * group of packet bytes, and perform a bitwise intersection between them. If
201 * this is the first field in the set, simply AND the buckets together
202 * (equivalent to using an all-ones starting bitmap), use the provided starting
203 * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next
204 * working bitmap, @fill.
205 *
206 * This is used for 8-bit fields (i.e. protocol numbers).
207 *
208 * Out-of-order (and superscalar) execution is vital here, so it's critical to
209 * avoid false data dependencies. CPU and compiler could (mostly) take care of
210 * this on their own, but the operation ordering is explicitly given here with
211 * a likely execution order in mind, to highlight possible stalls. That's why
212 * a number of logically distinct operations (i.e. loading buckets, intersecting
213 * buckets) are interleaved.
214 *
215 * Return: -1 on no match, rule index of match if @last, otherwise first long
216 * word index to be checked next (i.e. first filled word).
217 */
218static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
219 struct nft_pipapo_field *f, int offset,
220 const u8 *pkt, bool first, bool last)
221{
222 int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
223 u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf };
224 unsigned long *lt = f->lt, bsize = f->bsize;
225
226 lt += offset * NFT_PIPAPO_LONGS_PER_M256;
227 for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
228 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
229
230 if (first) {
231 NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
232 NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
233 NFT_PIPAPO_AVX2_AND(4, 0, 1);
234 } else {
235 NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
236 NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]);
237 NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
238 NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing);
239 NFT_PIPAPO_AVX2_AND(3, 0, 1);
240 NFT_PIPAPO_AVX2_AND(4, 2, 3);
241 }
242
243 NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
244 NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
245
246 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
247 if (last)
248 return b;
249
250 if (unlikely(ret == -1))
251 ret = b / XSAVE_YMM_SIZE;
252
253 continue;
254nomatch:
255 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
256nothing:
257 ;
258 }
259
260 return ret;
261}
262
263/**
264 * nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups
265 * @map: Previous match result, used as initial bitmap
266 * @fill: Destination bitmap to be filled with current match result
267 * @f: Field, containing lookup and mapping tables
268 * @offset: Ignore buckets before the given index, no bits are filled there
269 * @pkt: Packet data, pointer to input nftables register
270 * @first: If this is the first field, don't source previous result
271 * @last: Last field: stop at the first match and return bit index
272 *
273 * See nft_pipapo_avx2_lookup_4b_2().
274 *
275 * This is used for 16-bit fields (i.e. ports).
276 *
277 * Return: -1 on no match, rule index of match if @last, otherwise first long
278 * word index to be checked next (i.e. first filled word).
279 */
280static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
281 struct nft_pipapo_field *f, int offset,
282 const u8 *pkt, bool first, bool last)
283{
284 int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
285 u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf };
286 unsigned long *lt = f->lt, bsize = f->bsize;
287
288 lt += offset * NFT_PIPAPO_LONGS_PER_M256;
289 for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
290 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
291
292 if (first) {
293 NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
294 NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
295 NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
296 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
297 NFT_PIPAPO_AVX2_AND(4, 0, 1);
298 NFT_PIPAPO_AVX2_AND(5, 2, 3);
299 NFT_PIPAPO_AVX2_AND(7, 4, 5);
300 } else {
301 NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
302
303 NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
304
305 NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
306 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
307 NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
308 NFT_PIPAPO_AVX2_AND(5, 0, 1);
309
310 NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
311
312 NFT_PIPAPO_AVX2_AND(6, 2, 3);
313 NFT_PIPAPO_AVX2_AND(7, 4, 5);
314 /* Stall */
315 NFT_PIPAPO_AVX2_AND(7, 6, 7);
316 }
317
318 /* Stall */
319 NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch);
320 NFT_PIPAPO_AVX2_STORE(map[i_ul], 7);
321
322 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
323 if (last)
324 return b;
325
326 if (unlikely(ret == -1))
327 ret = b / XSAVE_YMM_SIZE;
328
329 continue;
330nomatch:
331 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
332nothing:
333 ;
334 }
335
336 return ret;
337}
338
339/**
340 * nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups
341 * @map: Previous match result, used as initial bitmap
342 * @fill: Destination bitmap to be filled with current match result
343 * @f: Field, containing lookup and mapping tables
344 * @offset: Ignore buckets before the given index, no bits are filled there
345 * @pkt: Packet data, pointer to input nftables register
346 * @first: If this is the first field, don't source previous result
347 * @last: Last field: stop at the first match and return bit index
348 *
349 * See nft_pipapo_avx2_lookup_4b_2().
350 *
351 * This is used for 32-bit fields (i.e. IPv4 addresses).
352 *
353 * Return: -1 on no match, rule index of match if @last, otherwise first long
354 * word index to be checked next (i.e. first filled word).
355 */
356static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
357 struct nft_pipapo_field *f, int offset,
358 const u8 *pkt, bool first, bool last)
359{
360 u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
361 pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
362 };
363 int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
364 unsigned long *lt = f->lt, bsize = f->bsize;
365
366 lt += offset * NFT_PIPAPO_LONGS_PER_M256;
367 for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
368 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
369
370 if (first) {
371 NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
372 NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
373 NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
374 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
375 NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 4, pg[4], bsize);
376 NFT_PIPAPO_AVX2_AND(5, 0, 1);
377 NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 5, pg[5], bsize);
378 NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 6, pg[6], bsize);
379 NFT_PIPAPO_AVX2_AND(8, 2, 3);
380 NFT_PIPAPO_AVX2_AND(9, 4, 5);
381 NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
382 NFT_PIPAPO_AVX2_AND(11, 6, 7);
383 NFT_PIPAPO_AVX2_AND(12, 8, 9);
384 NFT_PIPAPO_AVX2_AND(13, 10, 11);
385
386 /* Stall */
387 NFT_PIPAPO_AVX2_AND(1, 12, 13);
388 } else {
389 NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
390 NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
391 NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
392 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
393 NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
394
395 NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
396
397 NFT_PIPAPO_AVX2_AND(5, 0, 1);
398 NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize);
399 NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
400 NFT_PIPAPO_AVX2_AND(8, 2, 3);
401 NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize);
402 NFT_PIPAPO_AVX2_AND(10, 4, 5);
403 NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
404 NFT_PIPAPO_AVX2_AND(12, 6, 7);
405 NFT_PIPAPO_AVX2_AND(13, 8, 9);
406 NFT_PIPAPO_AVX2_AND(14, 10, 11);
407
408 /* Stall */
409 NFT_PIPAPO_AVX2_AND(1, 12, 13);
410 NFT_PIPAPO_AVX2_AND(1, 1, 14);
411 }
412
413 NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch);
414 NFT_PIPAPO_AVX2_STORE(map[i_ul], 1);
415
416 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
417 if (last)
418 return b;
419
420 if (unlikely(ret == -1))
421 ret = b / XSAVE_YMM_SIZE;
422
423 continue;
424
425nomatch:
426 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
427nothing:
428 ;
429 }
430
431 return ret;
432}
433
434/**
435 * nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups
436 * @map: Previous match result, used as initial bitmap
437 * @fill: Destination bitmap to be filled with current match result
438 * @f: Field, containing lookup and mapping tables
439 * @offset: Ignore buckets before the given index, no bits are filled there
440 * @pkt: Packet data, pointer to input nftables register
441 * @first: If this is the first field, don't source previous result
442 * @last: Last field: stop at the first match and return bit index
443 *
444 * See nft_pipapo_avx2_lookup_4b_2().
445 *
446 * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
447 *
448 * Return: -1 on no match, rule index of match if @last, otherwise first long
449 * word index to be checked next (i.e. first filled word).
450 */
451static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
452 struct nft_pipapo_field *f, int offset,
453 const u8 *pkt, bool first, bool last)
454{
455 u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
456 pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
457 pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
458 };
459 int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
460 unsigned long *lt = f->lt, bsize = f->bsize;
461
462 lt += offset * NFT_PIPAPO_LONGS_PER_M256;
463 for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
464 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
465
466 if (!first)
467 NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
468
469 NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize);
470 NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
471 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
472
473 if (!first) {
474 NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
475 NFT_PIPAPO_AVX2_AND(1, 1, 0);
476 }
477
478 NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
479 NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 4, pg[4], bsize);
480 NFT_PIPAPO_AVX2_AND(6, 2, 3);
481 NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
482 NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 6, pg[6], bsize);
483 NFT_PIPAPO_AVX2_AND(9, 1, 4);
484 NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
485 NFT_PIPAPO_AVX2_AND(11, 5, 6);
486 NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 8, pg[8], bsize);
487 NFT_PIPAPO_AVX2_AND(13, 7, 8);
488 NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 9, pg[9], bsize);
489
490 NFT_PIPAPO_AVX2_AND(0, 9, 10);
491 NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 10, pg[10], bsize);
492 NFT_PIPAPO_AVX2_AND(2, 11, 12);
493 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize);
494 NFT_PIPAPO_AVX2_AND(4, 13, 14);
495 NFT_PIPAPO_AVX2_AND(5, 0, 1);
496
497 NFT_PIPAPO_AVX2_AND(6, 2, 3);
498
499 /* Stalls */
500 NFT_PIPAPO_AVX2_AND(7, 4, 5);
501 NFT_PIPAPO_AVX2_AND(8, 6, 7);
502
503 NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch);
504 NFT_PIPAPO_AVX2_STORE(map[i_ul], 8);
505
506 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
507 if (last)
508 return b;
509
510 if (unlikely(ret == -1))
511 ret = b / XSAVE_YMM_SIZE;
512
513 continue;
514nomatch:
515 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
516nothing:
517 ;
518 }
519
520 return ret;
521}
522
523/**
524 * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups
525 * @map: Previous match result, used as initial bitmap
526 * @fill: Destination bitmap to be filled with current match result
527 * @f: Field, containing lookup and mapping tables
528 * @offset: Ignore buckets before the given index, no bits are filled there
529 * @pkt: Packet data, pointer to input nftables register
530 * @first: If this is the first field, don't source previous result
531 * @last: Last field: stop at the first match and return bit index
532 *
533 * See nft_pipapo_avx2_lookup_4b_2().
534 *
535 * This is used for 128-bit fields (i.e. IPv6 addresses).
536 *
537 * Return: -1 on no match, rule index of match if @last, otherwise first long
538 * word index to be checked next (i.e. first filled word).
539 */
540static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
541 struct nft_pipapo_field *f, int offset,
542 const u8 *pkt, bool first, bool last)
543{
544 u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
545 pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
546 pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
547 pkt[6] >> 4, pkt[6] & 0xf, pkt[7] >> 4, pkt[7] & 0xf,
548 pkt[8] >> 4, pkt[8] & 0xf, pkt[9] >> 4, pkt[9] & 0xf,
549 pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf,
550 pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf,
551 pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf,
552 };
553 int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
554 unsigned long *lt = f->lt, bsize = f->bsize;
555
556 lt += offset * NFT_PIPAPO_LONGS_PER_M256;
557 for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
558 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
559
560 if (!first)
561 NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
562
563 NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize);
564 NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
565 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
566 NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
567 if (!first) {
568 NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
569 NFT_PIPAPO_AVX2_AND(1, 1, 0);
570 }
571
572 NFT_PIPAPO_AVX2_AND(5, 2, 3);
573 NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize);
574 NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
575 NFT_PIPAPO_AVX2_AND(8, 1, 4);
576 NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize);
577 NFT_PIPAPO_AVX2_AND(10, 5, 6);
578 NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
579 NFT_PIPAPO_AVX2_AND(12, 7, 8);
580 NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt, 8, pg[8], bsize);
581 NFT_PIPAPO_AVX2_AND(14, 9, 10);
582
583 NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 9, pg[9], bsize);
584 NFT_PIPAPO_AVX2_AND(1, 11, 12);
585 NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 10, pg[10], bsize);
586 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize);
587 NFT_PIPAPO_AVX2_AND(4, 13, 14);
588 NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 12, pg[12], bsize);
589 NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 13, pg[13], bsize);
590 NFT_PIPAPO_AVX2_AND(7, 0, 1);
591 NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 14, pg[14], bsize);
592 NFT_PIPAPO_AVX2_AND(9, 2, 3);
593 NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize);
594 NFT_PIPAPO_AVX2_AND(11, 4, 5);
595 NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize);
596 NFT_PIPAPO_AVX2_AND(13, 6, 7);
597 NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize);
598
599 NFT_PIPAPO_AVX2_AND(0, 8, 9);
600 NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 18, pg[18], bsize);
601 NFT_PIPAPO_AVX2_AND(2, 10, 11);
602 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 19, pg[19], bsize);
603 NFT_PIPAPO_AVX2_AND(4, 12, 13);
604 NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 20, pg[20], bsize);
605 NFT_PIPAPO_AVX2_AND(6, 14, 0);
606 NFT_PIPAPO_AVX2_AND(7, 1, 2);
607 NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 21, pg[21], bsize);
608 NFT_PIPAPO_AVX2_AND(9, 3, 4);
609 NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize);
610 NFT_PIPAPO_AVX2_AND(11, 5, 6);
611 NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize);
612 NFT_PIPAPO_AVX2_AND(13, 7, 8);
613
614 NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize);
615 NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 25, pg[25], bsize);
616 NFT_PIPAPO_AVX2_AND(1, 9, 10);
617 NFT_PIPAPO_AVX2_AND(2, 11, 12);
618 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 26, pg[26], bsize);
619 NFT_PIPAPO_AVX2_AND(4, 13, 14);
620 NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 27, pg[27], bsize);
621 NFT_PIPAPO_AVX2_AND(6, 0, 1);
622 NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 28, pg[28], bsize);
623 NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 29, pg[29], bsize);
624 NFT_PIPAPO_AVX2_AND(9, 2, 3);
625 NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize);
626 NFT_PIPAPO_AVX2_AND(11, 4, 5);
627 NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize);
628
629 NFT_PIPAPO_AVX2_AND(0, 6, 7);
630 NFT_PIPAPO_AVX2_AND(1, 8, 9);
631 NFT_PIPAPO_AVX2_AND(2, 10, 11);
632 NFT_PIPAPO_AVX2_AND(3, 12, 0);
633
634 /* Stalls */
635 NFT_PIPAPO_AVX2_AND(4, 1, 2);
636 NFT_PIPAPO_AVX2_AND(5, 3, 4);
637
638 NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch);
639 NFT_PIPAPO_AVX2_STORE(map[i_ul], 5);
640
641 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
642 if (last)
643 return b;
644
645 if (unlikely(ret == -1))
646 ret = b / XSAVE_YMM_SIZE;
647
648 continue;
649nomatch:
650 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
651nothing:
652 ;
653 }
654
655 return ret;
656}
657
658/**
659 * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group
660 * @map: Previous match result, used as initial bitmap
661 * @fill: Destination bitmap to be filled with current match result
662 * @f: Field, containing lookup and mapping tables
663 * @offset: Ignore buckets before the given index, no bits are filled there
664 * @pkt: Packet data, pointer to input nftables register
665 * @first: If this is the first field, don't source previous result
666 * @last: Last field: stop at the first match and return bit index
667 *
668 * See nft_pipapo_avx2_lookup_4b_2().
669 *
670 * This is used for 8-bit fields (i.e. protocol numbers).
671 *
672 * Return: -1 on no match, rule index of match if @last, otherwise first long
673 * word index to be checked next (i.e. first filled word).
674 */
675static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
676 struct nft_pipapo_field *f, int offset,
677 const u8 *pkt, bool first, bool last)
678{
679 int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
680 unsigned long *lt = f->lt, bsize = f->bsize;
681
682 lt += offset * NFT_PIPAPO_LONGS_PER_M256;
683 for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
684 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
685
686 if (first) {
687 NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize);
688 } else {
689 NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
690 NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
691 NFT_PIPAPO_AVX2_AND(2, 0, 1);
692 NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
693 }
694
695 NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch);
696 NFT_PIPAPO_AVX2_STORE(map[i_ul], 2);
697
698 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
699 if (last)
700 return b;
701
702 if (unlikely(ret == -1))
703 ret = b / XSAVE_YMM_SIZE;
704
705 continue;
706nomatch:
707 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
708nothing:
709 ;
710 }
711
712 return ret;
713}
714
715/**
716 * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups
717 * @map: Previous match result, used as initial bitmap
718 * @fill: Destination bitmap to be filled with current match result
719 * @f: Field, containing lookup and mapping tables
720 * @offset: Ignore buckets before the given index, no bits are filled there
721 * @pkt: Packet data, pointer to input nftables register
722 * @first: If this is the first field, don't source previous result
723 * @last: Last field: stop at the first match and return bit index
724 *
725 * See nft_pipapo_avx2_lookup_4b_2().
726 *
727 * This is used for 16-bit fields (i.e. ports).
728 *
729 * Return: -1 on no match, rule index of match if @last, otherwise first long
730 * word index to be checked next (i.e. first filled word).
731 */
732static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
733 struct nft_pipapo_field *f, int offset,
734 const u8 *pkt, bool first, bool last)
735{
736 int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
737 unsigned long *lt = f->lt, bsize = f->bsize;
738
739 lt += offset * NFT_PIPAPO_LONGS_PER_M256;
740 for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
741 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
742
743 if (first) {
744 NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
745 NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
746 NFT_PIPAPO_AVX2_AND(4, 0, 1);
747 } else {
748 NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
749 NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
750 NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
751
752 /* Stall */
753 NFT_PIPAPO_AVX2_AND(3, 0, 1);
754 NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
755 NFT_PIPAPO_AVX2_AND(4, 3, 2);
756 }
757
758 /* Stall */
759 NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
760 NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
761
762 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
763 if (last)
764 return b;
765
766 if (unlikely(ret == -1))
767 ret = b / XSAVE_YMM_SIZE;
768
769 continue;
770nomatch:
771 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
772nothing:
773 ;
774 }
775
776 return ret;
777}
778
779/**
780 * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups
781 * @map: Previous match result, used as initial bitmap
782 * @fill: Destination bitmap to be filled with current match result
783 * @f: Field, containing lookup and mapping tables
784 * @offset: Ignore buckets before the given index, no bits are filled there
785 * @pkt: Packet data, pointer to input nftables register
786 * @first: If this is the first field, don't source previous result
787 * @last: Last field: stop at the first match and return bit index
788 *
789 * See nft_pipapo_avx2_lookup_4b_2().
790 *
791 * This is used for 32-bit fields (i.e. IPv4 addresses).
792 *
793 * Return: -1 on no match, rule index of match if @last, otherwise first long
794 * word index to be checked next (i.e. first filled word).
795 */
796static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
797 struct nft_pipapo_field *f, int offset,
798 const u8 *pkt, bool first, bool last)
799{
800 int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
801 unsigned long *lt = f->lt, bsize = f->bsize;
802
803 lt += offset * NFT_PIPAPO_LONGS_PER_M256;
804 for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
805 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
806
807 if (first) {
808 NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
809 NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
810 NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
811 NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);
812
813 /* Stall */
814 NFT_PIPAPO_AVX2_AND(4, 0, 1);
815 NFT_PIPAPO_AVX2_AND(5, 2, 3);
816 NFT_PIPAPO_AVX2_AND(0, 4, 5);
817 } else {
818 NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
819 NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
820 NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
821 NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
822 NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
823
824 NFT_PIPAPO_AVX2_AND(5, 0, 1);
825 NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
826 NFT_PIPAPO_AVX2_AND(6, 2, 3);
827
828 /* Stall */
829 NFT_PIPAPO_AVX2_AND(7, 4, 5);
830 NFT_PIPAPO_AVX2_AND(0, 6, 7);
831 }
832
833 NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch);
834 NFT_PIPAPO_AVX2_STORE(map[i_ul], 0);
835
836 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
837 if (last)
838 return b;
839
840 if (unlikely(ret == -1))
841 ret = b / XSAVE_YMM_SIZE;
842
843 continue;
844
845nomatch:
846 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
847nothing:
848 ;
849 }
850
851 return ret;
852}
853
854/**
855 * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups
856 * @map: Previous match result, used as initial bitmap
857 * @fill: Destination bitmap to be filled with current match result
858 * @f: Field, containing lookup and mapping tables
859 * @offset: Ignore buckets before the given index, no bits are filled there
860 * @pkt: Packet data, pointer to input nftables register
861 * @first: If this is the first field, don't source previous result
862 * @last: Last field: stop at the first match and return bit index
863 *
864 * See nft_pipapo_avx2_lookup_4b_2().
865 *
866 * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
867 *
868 * Return: -1 on no match, rule index of match if @last, otherwise first long
869 * word index to be checked next (i.e. first filled word).
870 */
871static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
872 struct nft_pipapo_field *f, int offset,
873 const u8 *pkt, bool first, bool last)
874{
875 int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
876 unsigned long *lt = f->lt, bsize = f->bsize;
877
878 lt += offset * NFT_PIPAPO_LONGS_PER_M256;
879 for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
880 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
881
882 if (first) {
883 NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
884 NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
885 NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
886 NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);
887 NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize);
888
889 NFT_PIPAPO_AVX2_AND(5, 0, 1);
890 NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 6, pkt[5], bsize);
891 NFT_PIPAPO_AVX2_AND(7, 2, 3);
892
893 /* Stall */
894 NFT_PIPAPO_AVX2_AND(0, 4, 5);
895 NFT_PIPAPO_AVX2_AND(1, 6, 7);
896 NFT_PIPAPO_AVX2_AND(4, 0, 1);
897 } else {
898 NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
899 NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
900 NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
901 NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
902 NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
903
904 NFT_PIPAPO_AVX2_AND(5, 0, 1);
905 NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
906
907 NFT_PIPAPO_AVX2_AND(6, 2, 3);
908 NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 4, pkt[4], bsize);
909 NFT_PIPAPO_AVX2_AND(0, 4, 5);
910 NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 5, pkt[5], bsize);
911 NFT_PIPAPO_AVX2_AND(2, 6, 7);
912
913 /* Stall */
914 NFT_PIPAPO_AVX2_AND(3, 0, 1);
915 NFT_PIPAPO_AVX2_AND(4, 2, 3);
916 }
917
918 NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
919 NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
920
921 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
922 if (last)
923 return b;
924
925 if (unlikely(ret == -1))
926 ret = b / XSAVE_YMM_SIZE;
927
928 continue;
929
930nomatch:
931 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
932nothing:
933 ;
934 }
935
936 return ret;
937}
938
939/**
940 * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups
941 * @map: Previous match result, used as initial bitmap
942 * @fill: Destination bitmap to be filled with current match result
943 * @f: Field, containing lookup and mapping tables
944 * @offset: Ignore buckets before the given index, no bits are filled there
945 * @pkt: Packet data, pointer to input nftables register
946 * @first: If this is the first field, don't source previous result
947 * @last: Last field: stop at the first match and return bit index
948 *
949 * See nft_pipapo_avx2_lookup_4b_2().
950 *
951 * This is used for 128-bit fields (i.e. IPv6 addresses).
952 *
953 * Return: -1 on no match, rule index of match if @last, otherwise first long
954 * word index to be checked next (i.e. first filled word).
955 */
956static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
957 struct nft_pipapo_field *f, int offset,
958 const u8 *pkt, bool first, bool last)
959{
960 int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
961 unsigned long *lt = f->lt, bsize = f->bsize;
962
963 lt += offset * NFT_PIPAPO_LONGS_PER_M256;
964 for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
965 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
966
967 if (!first)
968 NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
969
970 NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
971 NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
972 NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
973 if (!first) {
974 NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
975 NFT_PIPAPO_AVX2_AND(1, 1, 0);
976 }
977 NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
978
979 NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 4, pkt[4], bsize);
980 NFT_PIPAPO_AVX2_AND(6, 1, 2);
981 NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 5, pkt[5], bsize);
982 NFT_PIPAPO_AVX2_AND(0, 3, 4);
983 NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 6, pkt[6], bsize);
984
985 NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 7, pkt[7], bsize);
986 NFT_PIPAPO_AVX2_AND(3, 5, 6);
987 NFT_PIPAPO_AVX2_AND(4, 0, 1);
988 NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize);
989
990 NFT_PIPAPO_AVX2_AND(6, 2, 3);
991 NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize);
992 NFT_PIPAPO_AVX2_AND(0, 4, 5);
993 NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
994 NFT_PIPAPO_AVX2_AND(2, 6, 7);
995 NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
996 NFT_PIPAPO_AVX2_AND(4, 0, 1);
997 NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize);
998 NFT_PIPAPO_AVX2_AND(6, 2, 3);
999 NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize);
1000 NFT_PIPAPO_AVX2_AND(0, 4, 5);
1001 NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize);
1002 NFT_PIPAPO_AVX2_AND(2, 6, 7);
1003 NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize);
1004 NFT_PIPAPO_AVX2_AND(4, 0, 1);
1005
1006 /* Stall */
1007 NFT_PIPAPO_AVX2_AND(5, 2, 3);
1008 NFT_PIPAPO_AVX2_AND(6, 4, 5);
1009
1010 NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch);
1011 NFT_PIPAPO_AVX2_STORE(map[i_ul], 6);
1012
1013 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
1014 if (last)
1015 return b;
1016
1017 if (unlikely(ret == -1))
1018 ret = b / XSAVE_YMM_SIZE;
1019
1020 continue;
1021
1022nomatch:
1023 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
1024nothing:
1025 ;
1026 }
1027
1028 return ret;
1029}
1030
1031/**
1032 * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes
1033 * @map: Previous match result, used as initial bitmap
1034 * @fill: Destination bitmap to be filled with current match result
1035 * @f: Field, containing lookup and mapping tables
1036 * @offset: Ignore buckets before the given index, no bits are filled there
1037 * @pkt: Packet data, pointer to input nftables register
1038 * @first: If this is the first field, don't source previous result
1039 * @last: Last field: stop at the first match and return bit index
1040 *
1041 * This function should never be called, but is provided for the case the field
1042 * size doesn't match any of the known data types. Matching rate is
1043 * substantially lower than AVX2 routines.
1044 *
1045 * Return: -1 on no match, rule index of match if @last, otherwise first long
1046 * word index to be checked next (i.e. first filled word).
1047 */
1048static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
1049 struct nft_pipapo_field *f, int offset,
1050 const u8 *pkt, bool first, bool last)
1051{
1052 unsigned long *lt = f->lt, bsize = f->bsize;
1053 int i, ret = -1, b;
1054
1055 lt += offset * NFT_PIPAPO_LONGS_PER_M256;
1056
1057 if (first)
1058 memset(map, 0xff, bsize * sizeof(*map));
1059
1060 for (i = offset; i < bsize; i++) {
1061 if (f->bb == 8)
1062 pipapo_and_field_buckets_8bit(f, map, pkt);
1063 else
1064 pipapo_and_field_buckets_4bit(f, map, pkt);
1065 NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
1066
1067 b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last);
1068
1069 if (last)
1070 return b;
1071
1072 if (ret == -1)
1073 ret = b / XSAVE_YMM_SIZE;
1074 }
1075
1076 return ret;
1077}
1078
1079/**
1080 * nft_pipapo_avx2_estimate() - Set size, space and lookup complexity
1081 * @desc: Set description, element count and field description used
1082 * @features: Flags: NFT_SET_INTERVAL needs to be there
1083 * @est: Storage for estimation data
1084 *
1085 * Return: true if set is compatible and AVX2 available, false otherwise.
1086 */
1087bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
1088 struct nft_set_estimate *est)
1089{
eb16933a
SB
1090 if (!(features & NFT_SET_INTERVAL) ||
1091 desc->field_count < NFT_PIPAPO_MIN_FIELDS)
7400b063
SB
1092 return false;
1093
1094 if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
1095 return false;
1096
1097 est->size = pipapo_estimate_size(desc);
1098 if (!est->size)
1099 return false;
1100
1101 est->lookup = NFT_SET_CLASS_O_LOG_N;
1102
1103 est->space = NFT_SET_CLASS_O_N;
1104
1105 return true;
1106}
1107
1108/**
1109 * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
1110 * @net: Network namespace
1111 * @set: nftables API set representation
1112 * @elem: nftables API element representation containing key data
1113 * @ext: nftables API extension pointer, filled with matching reference
1114 *
1115 * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
1116 *
1117 * This implementation exploits the repetitive characteristic of the algorithm
1118 * to provide a fast, vectorised version using the AVX2 SIMD instruction set.
1119 *
1120 * Return: true on match, false otherwise.
1121 */
1122bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
1123 const u32 *key, const struct nft_set_ext **ext)
1124{
1125 struct nft_pipapo *priv = nft_set_priv(set);
1126 unsigned long *res, *fill, *scratch;
1127 u8 genmask = nft_genmask_cur(net);
1128 const u8 *rp = (const u8 *)key;
1129 struct nft_pipapo_match *m;
1130 struct nft_pipapo_field *f;
1131 bool map_index;
1132 int i, ret = 0;
1133
1134 m = rcu_dereference(priv->match);
1135
1136 /* This also protects access to all data related to scratch maps */
1137 kernel_fpu_begin();
1138
1139 scratch = *raw_cpu_ptr(m->scratch_aligned);
1140 if (unlikely(!scratch)) {
1141 kernel_fpu_end();
1142 return false;
1143 }
1144 map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index);
1145
1146 res = scratch + (map_index ? m->bsize_max : 0);
1147 fill = scratch + (map_index ? 0 : m->bsize_max);
1148
1149 /* Starting map doesn't need to be set for this implementation */
1150
1151 nft_pipapo_avx2_prepare();
1152
1153next_match:
1154 nft_pipapo_for_each_field(f, i, m) {
1155 bool last = i == m->field_count - 1, first = !i;
1156
1157#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \
1158 (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \
1159 ret, rp, \
1160 first, last))
1161
1162 if (likely(f->bb == 8)) {
1163 if (f->groups == 1) {
1164 NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1);
1165 } else if (f->groups == 2) {
1166 NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2);
1167 } else if (f->groups == 4) {
1168 NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4);
1169 } else if (f->groups == 6) {
1170 NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6);
1171 } else if (f->groups == 16) {
1172 NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
1173 } else {
1174 ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
1175 ret, rp,
1176 first, last);
1177 }
1178 } else {
1179 if (f->groups == 2) {
1180 NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2);
1181 } else if (f->groups == 4) {
1182 NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4);
1183 } else if (f->groups == 8) {
1184 NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8);
1185 } else if (f->groups == 12) {
1186 NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12);
1187 } else if (f->groups == 32) {
1188 NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
1189 } else {
1190 ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
1191 ret, rp,
1192 first, last);
1193 }
1194 }
1195 NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
1196
1197#undef NFT_SET_PIPAPO_AVX2_LOOKUP
1198
1199 if (ret < 0)
1200 goto out;
1201
1202 if (last) {
1203 *ext = &f->mt[ret].e->ext;
1204 if (unlikely(nft_set_elem_expired(*ext) ||
1205 !nft_set_elem_active(*ext, genmask))) {
1206 ret = 0;
1207 goto next_match;
1208 }
1209
1210 goto out;
1211 }
1212
1213 swap(res, fill);
1214 rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
1215 }
1216
1217out:
1218 if (i % 2)
1219 raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index);
1220 kernel_fpu_end();
1221
1222 return ret >= 0;
1223}