| 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | /* |
| 3 | * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> |
| 4 | * |
| 5 | * Development of this code funded by Astaro AG (http://www.astaro.com/) |
| 6 | */ |
| 7 | |
| 8 | #include <linux/kernel.h> |
| 9 | #include <linux/init.h> |
| 10 | #include <linux/module.h> |
| 11 | #include <linux/list.h> |
| 12 | #include <linux/rbtree.h> |
| 13 | #include <linux/netlink.h> |
| 14 | #include <linux/netfilter.h> |
| 15 | #include <linux/netfilter/nf_tables.h> |
| 16 | #include <net/netfilter/nf_tables_core.h> |
| 17 | |
/* Per-set private state of the rbtree set backend. */
struct nft_rbtree {
	/* Root of the tree whose nodes are struct nft_rbtree_elem.node. */
	struct rb_root root;
	/* Write-held around tree updates; read-held by the walk and by the
	 * lookup/get slow paths.
	 */
	rwlock_t lock;
	/* Sequence counter tied to @lock; written around tree updates so the
	 * lockless lookup/get paths can detect a concurrent change.
	 */
	seqcount_rwlock_t count;
	/* jiffies value recorded at gc_init and after each commit-time gc. */
	unsigned long last_gc;
};
| 24 | |
/* One set element as linked into the tree. */
struct nft_rbtree_elem {
	/* Must stay the first member: nft_rbtree_init() build-asserts that
	 * its offset is 0.
	 */
	struct nft_elem_priv priv;
	/* Linkage into nft_rbtree.root. */
	struct rb_node node;
	/* Element extensions (key, flags, ...). The ops table reports
	 * elemsize as the offset of this member.
	 */
	struct nft_set_ext ext;
};
| 30 | |
| 31 | static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe) |
| 32 | { |
| 33 | return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) && |
| 34 | (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END); |
| 35 | } |
| 36 | |
| 37 | static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) |
| 38 | { |
| 39 | return !nft_rbtree_interval_end(rbe); |
| 40 | } |
| 41 | |
| 42 | static int nft_rbtree_cmp(const struct nft_set *set, |
| 43 | const struct nft_rbtree_elem *e1, |
| 44 | const struct nft_rbtree_elem *e2) |
| 45 | { |
| 46 | return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext), |
| 47 | set->klen); |
| 48 | } |
| 49 | |
| 50 | static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) |
| 51 | { |
| 52 | return nft_set_elem_expired(&rbe->ext); |
| 53 | } |
| 54 | |
/* Walk the tree looking for @key in the current generation.
 *
 * @seq is the sequence count sampled by the caller; the walk gives up and
 * returns false as soon as the counter is found to have changed, so a false
 * return must be re-validated by the caller.
 *
 * On a match, *@ext is set to the extensions of the matching element and
 * true is returned. Otherwise false is returned and *@ext is left untouched.
 */
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
				const u32 *key, const struct nft_set_ext **ext,
				unsigned int seq)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct nft_rbtree_elem *rbe, *interval = NULL;
	u8 genmask = nft_genmask_cur(net);
	const struct rb_node *parent;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* Tree changed under us: abort, the caller decides on retry. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
		if (d < 0) {
			/* Node key sorts below @key: descend left and remember
			 * this node as the candidate enclosing interval, unless
			 * the current candidate is a start element with the
			 * same key and this node is an end element.
			 */
			parent = rcu_dereference_raw(parent->rb_left);
			if (interval &&
			    !nft_rbtree_cmp(set, rbe, interval) &&
			    nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(interval))
				continue;
			interval = rbe;
		} else if (d > 0)
			parent = rcu_dereference_raw(parent->rb_right);
		else {
			/* Exact key match, but not active in the current
			 * generation: keep searching on the left.
			 */
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_rbtree_elem_expired(rbe))
				return false;

			/* Exact match on an end element: a miss for anonymous
			 * sets; otherwise drop the candidate and keep searching
			 * on the left.
			 */
			if (nft_rbtree_interval_end(rbe)) {
				if (nft_set_is_anonymous(set))
					return false;
				parent = rcu_dereference_raw(parent->rb_left);
				interval = NULL;
				continue;
			}

			*ext = &rbe->ext;
			return true;
		}
	}

	/* No exact match: for interval sets fall back to the remembered
	 * candidate, provided it is an active, unexpired start element.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_rbtree_elem_expired(interval) &&
	    nft_rbtree_interval_start(interval)) {
		*ext = &interval->ext;
		return true;
	}

	return false;
}
| 115 | |
| 116 | INDIRECT_CALLABLE_SCOPE |
| 117 | bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, |
| 118 | const u32 *key, const struct nft_set_ext **ext) |
| 119 | { |
| 120 | struct nft_rbtree *priv = nft_set_priv(set); |
| 121 | unsigned int seq = read_seqcount_begin(&priv->count); |
| 122 | bool ret; |
| 123 | |
| 124 | ret = __nft_rbtree_lookup(net, set, key, ext, seq); |
| 125 | if (ret || !read_seqcount_retry(&priv->count, seq)) |
| 126 | return ret; |
| 127 | |
| 128 | read_lock_bh(&priv->lock); |
| 129 | seq = read_seqcount_begin(&priv->count); |
| 130 | ret = __nft_rbtree_lookup(net, set, key, ext, seq); |
| 131 | read_unlock_bh(&priv->lock); |
| 132 | |
| 133 | return ret; |
| 134 | } |
| 135 | |
/* Walk the tree looking for the element that @key refers to.
 *
 * @flags: when NFT_SET_ELEM_INTERVAL_END is set an end element is wanted,
 *         otherwise a start element.
 * @genmask: generation mask used for the active checks.
 * @seq: sequence count sampled by the caller; the walk returns false as soon
 *       as the counter is found to have changed.
 *
 * On success *@elem is set and true is returned. Otherwise false is returned
 * and *@elem is left untouched.
 */
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
			     const u32 *key, struct nft_rbtree_elem **elem,
			     unsigned int seq, unsigned int flags, u8 genmask)
{
	struct nft_rbtree_elem *rbe, *interval = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* Tree changed under us: abort, the caller decides on retry. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			/* Node key sorts below @key: candidate when a start
			 * element is wanted.
			 */
			parent = rcu_dereference_raw(parent->rb_left);
			if (!(flags & NFT_SET_ELEM_INTERVAL_END))
				interval = rbe;
		} else if (d > 0) {
			/* Node key sorts above @key: candidate when an end
			 * element is wanted.
			 */
			parent = rcu_dereference_raw(parent->rb_right);
			if (flags & NFT_SET_ELEM_INTERVAL_END)
				interval = rbe;
		} else {
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			/* Exact key match: accept it when the element has no
			 * flags extension, or when its end flag agrees with
			 * the requested one.
			 */
			if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
			    (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
			    (flags & NFT_SET_ELEM_INTERVAL_END)) {
				*elem = rbe;
				return true;
			}

			/* Wrong kind of element at this key: an end element
			 * invalidates the candidate; keep searching left.
			 */
			if (nft_rbtree_interval_end(rbe))
				interval = NULL;

			parent = rcu_dereference_raw(parent->rb_left);
		}
	}

	/* No exact match: for interval sets fall back to the remembered
	 * candidate if it is active, unexpired and of the requested kind.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    ((!nft_rbtree_interval_end(interval) &&
	      !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
	     (nft_rbtree_interval_end(interval) &&
	      (flags & NFT_SET_ELEM_INTERVAL_END)))) {
		*elem = interval;
		return true;
	}

	return false;
}
| 199 | |
| 200 | static struct nft_elem_priv * |
| 201 | nft_rbtree_get(const struct net *net, const struct nft_set *set, |
| 202 | const struct nft_set_elem *elem, unsigned int flags) |
| 203 | { |
| 204 | struct nft_rbtree *priv = nft_set_priv(set); |
| 205 | unsigned int seq = read_seqcount_begin(&priv->count); |
| 206 | struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT); |
| 207 | const u32 *key = (const u32 *)&elem->key.val; |
| 208 | u8 genmask = nft_genmask_cur(net); |
| 209 | bool ret; |
| 210 | |
| 211 | ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); |
| 212 | if (ret || !read_seqcount_retry(&priv->count, seq)) |
| 213 | return &rbe->priv; |
| 214 | |
| 215 | read_lock_bh(&priv->lock); |
| 216 | seq = read_seqcount_begin(&priv->count); |
| 217 | ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); |
| 218 | read_unlock_bh(&priv->lock); |
| 219 | |
| 220 | if (!ret) |
| 221 | return ERR_PTR(-ENOENT); |
| 222 | |
| 223 | return &rbe->priv; |
| 224 | } |
| 225 | |
/* Deactivate the data of @rbe and unlink it from the tree.
 *
 * Takes no lock and does not touch the sequence counter itself: lockdep
 * asserts that the caller already holds priv->lock for writing.
 */
static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
				      struct nft_rbtree *priv,
				      struct nft_rbtree_elem *rbe)
{
	lockdep_assert_held_write(&priv->lock);
	nft_setelem_data_deactivate(net, set, &rbe->priv);
	rb_erase(&rbe->node, &priv->root);
}
| 234 | |
| 235 | static const struct nft_rbtree_elem * |
| 236 | nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, |
| 237 | struct nft_rbtree_elem *rbe, u8 genmask) |
| 238 | { |
| 239 | struct nft_set *set = (struct nft_set *)__set; |
| 240 | struct rb_node *prev = rb_prev(&rbe->node); |
| 241 | struct net *net = read_pnet(&set->net); |
| 242 | struct nft_rbtree_elem *rbe_prev; |
| 243 | struct nft_trans_gc *gc; |
| 244 | |
| 245 | gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); |
| 246 | if (!gc) |
| 247 | return ERR_PTR(-ENOMEM); |
| 248 | |
| 249 | /* search for end interval coming before this element. |
| 250 | * end intervals don't carry a timeout extension, they |
| 251 | * are coupled with the interval start element. |
| 252 | */ |
| 253 | while (prev) { |
| 254 | rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); |
| 255 | if (nft_rbtree_interval_end(rbe_prev) && |
| 256 | nft_set_elem_active(&rbe_prev->ext, genmask)) |
| 257 | break; |
| 258 | |
| 259 | prev = rb_prev(prev); |
| 260 | } |
| 261 | |
| 262 | rbe_prev = NULL; |
| 263 | if (prev) { |
| 264 | rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); |
| 265 | nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev); |
| 266 | |
| 267 | /* There is always room in this trans gc for this element, |
| 268 | * memory allocation never actually happens, hence, the warning |
| 269 | * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, |
| 270 | * this is synchronous gc which never fails. |
| 271 | */ |
| 272 | gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); |
| 273 | if (WARN_ON_ONCE(!gc)) |
| 274 | return ERR_PTR(-ENOMEM); |
| 275 | |
| 276 | nft_trans_gc_elem_add(gc, rbe_prev); |
| 277 | } |
| 278 | |
| 279 | nft_rbtree_gc_elem_remove(net, set, priv, rbe); |
| 280 | gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); |
| 281 | if (WARN_ON_ONCE(!gc)) |
| 282 | return ERR_PTR(-ENOMEM); |
| 283 | |
| 284 | nft_trans_gc_elem_add(gc, rbe); |
| 285 | |
| 286 | nft_trans_gc_queue_sync_done(gc); |
| 287 | |
| 288 | return rbe_prev; |
| 289 | } |
| 290 | |
| 291 | static bool nft_rbtree_update_first(const struct nft_set *set, |
| 292 | struct nft_rbtree_elem *rbe, |
| 293 | struct rb_node *first) |
| 294 | { |
| 295 | struct nft_rbtree_elem *first_elem; |
| 296 | |
| 297 | first_elem = rb_entry(first, struct nft_rbtree_elem, node); |
| 298 | /* this element is closest to where the new element is to be inserted: |
| 299 | * update the first element for the node list path. |
| 300 | */ |
| 301 | if (nft_rbtree_cmp(set, rbe, first_elem) < 0) |
| 302 | return true; |
| 303 | |
| 304 | return false; |
| 305 | } |
| 306 | |
/* Insert @new into the tree after checking it against existing elements.
 *
 * Called by nft_rbtree_insert() with the write lock held and the sequence
 * counter write section open.
 *
 * Expired elements met during the overlap walk are garbage-collected on the
 * spot.
 *
 * Returns:
 *   0           @new was linked into the tree.
 *   -EEXIST     an element with the same key and of the same kind (start or
 *               end) exists; *@elem_priv points to it.
 *   -ENOTEMPTY  partial overlap with an existing element.
 *   -EAGAIN     garbage collection removed an end element that had already
 *               been annotated as closest neighbour.
 *   other < 0   error reported by nft_rbtree_gc_elem().
 */
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_elem_priv **elem_priv)
{
	struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
	struct rb_node *node, *next, *parent, **p, *first = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 cur_genmask = nft_genmask_cur(net);
	u8 genmask = nft_genmask_next(net);
	int d;

	/* Descend the tree to find the existing element with the lowest key
	 * that is still greater than the key of the new element. The ordered
	 * walk that looks for overlaps starts from this element.
	 */
	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = nft_rbtree_cmp(set, rbe, new);

		if (d < 0) {
			p = &parent->rb_left;
		} else if (d > 0) {
			if (!first ||
			    nft_rbtree_update_first(set, rbe, first))
				first = &rbe->node;

			p = &parent->rb_right;
		} else {
			if (nft_rbtree_interval_end(rbe))
				p = &parent->rb_left;
			else
				p = &parent->rb_right;
		}
	}

	/* No greater element found: walk from the first node of the tree. */
	if (!first)
		first = rb_first(&priv->root);

	/* Detect overlap by going through the list of valid tree nodes.
	 * Values stored in the tree are in reversed order, starting from
	 * highest to lowest value.
	 */
	for (node = first; node != NULL; node = next) {
		/* Fetch the successor first: the current node may be erased
		 * by garbage collection below.
		 */
		next = rb_next(node);

		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (!nft_set_elem_active(&rbe->ext, genmask))
			continue;

		/* perform garbage collection to avoid bogus overlap reports
		 * but skip new elements in this transaction.
		 */
		if (nft_set_elem_expired(&rbe->ext) &&
		    nft_set_elem_active(&rbe->ext, cur_genmask)) {
			const struct nft_rbtree_elem *removed_end;

			removed_end = nft_rbtree_gc_elem(set, priv, rbe, genmask);
			if (IS_ERR(removed_end))
				return PTR_ERR(removed_end);

			/* An annotated neighbour is gone: start over. */
			if (removed_end == rbe_le || removed_end == rbe_ge)
				return -EAGAIN;

			continue;
		}

		d = nft_rbtree_cmp(set, rbe, new);
		if (d == 0) {
			/* Matching end element: no need to look for an
			 * overlapping greater or equal element.
			 */
			if (nft_rbtree_interval_end(rbe)) {
				rbe_le = rbe;
				break;
			}

			/* first element that is greater or equal to key value. */
			if (!rbe_ge) {
				rbe_ge = rbe;
				continue;
			}

			/* this is a closer more or equal element, update it. */
			if (nft_rbtree_cmp(set, rbe_ge, new) != 0) {
				rbe_ge = rbe;
				continue;
			}

			/* element is equal to key value, make sure flags are
			 * the same, an existing more or equal start element
			 * must not be replaced by more or equal end element.
			 */
			if ((nft_rbtree_interval_start(new) &&
			     nft_rbtree_interval_start(rbe_ge)) ||
			    (nft_rbtree_interval_end(new) &&
			     nft_rbtree_interval_end(rbe_ge))) {
				rbe_ge = rbe;
				continue;
			}
		} else if (d > 0) {
			/* annotate element greater than the new element. */
			rbe_ge = rbe;
			continue;
		} else if (d < 0) {
			/* annotate element less than the new element. */
			rbe_le = rbe;
			break;
		}
	}

	/* - new start element matching existing start element: full overlap
	 *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
	 */
	if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
	    nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
		*elem_priv = &rbe_ge->priv;
		return -EEXIST;
	}

	/* - new end element matching existing end element: full overlap
	 *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
	 */
	if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
	    nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
		*elem_priv = &rbe_le->priv;
		return -EEXIST;
	}

	/* - new start element with existing closest, less or equal key value
	 *   being a start element: partial overlap, reported as -ENOTEMPTY.
	 *   Anonymous sets allow for two consecutive start elements since they
	 *   are constant, skip them to avoid bogus overlap reports.
	 */
	if (!nft_set_is_anonymous(set) && rbe_le &&
	    nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new))
		return -ENOTEMPTY;

	/* - new end element with existing closest, less or equal key value
	 *   being an end element: partial overlap, reported as -ENOTEMPTY.
	 */
	if (rbe_le &&
	    nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new))
		return -ENOTEMPTY;

	/* - new end element with existing closest, greater or equal key value
	 *   being an end element: partial overlap, reported as -ENOTEMPTY.
	 */
	if (rbe_ge &&
	    nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
		return -ENOTEMPTY;

	/* Accepted element: pick insertion point depending on key value.
	 * The descent is redone from the root since garbage collection above
	 * may have erased nodes.
	 */
	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = nft_rbtree_cmp(set, rbe, new);

		if (d < 0)
			p = &parent->rb_left;
		else if (d > 0)
			p = &parent->rb_right;
		else if (nft_rbtree_interval_end(rbe))
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}

	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}
| 484 | |
| 485 | static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, |
| 486 | const struct nft_set_elem *elem, |
| 487 | struct nft_elem_priv **elem_priv) |
| 488 | { |
| 489 | struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv); |
| 490 | struct nft_rbtree *priv = nft_set_priv(set); |
| 491 | int err; |
| 492 | |
| 493 | do { |
| 494 | if (fatal_signal_pending(current)) |
| 495 | return -EINTR; |
| 496 | |
| 497 | cond_resched(); |
| 498 | |
| 499 | write_lock_bh(&priv->lock); |
| 500 | write_seqcount_begin(&priv->count); |
| 501 | err = __nft_rbtree_insert(net, set, rbe, elem_priv); |
| 502 | write_seqcount_end(&priv->count); |
| 503 | write_unlock_bh(&priv->lock); |
| 504 | } while (err == -EAGAIN); |
| 505 | |
| 506 | return err; |
| 507 | } |
| 508 | |
/* Unlink @rbe from the tree.
 *
 * Takes the write lock and opens a sequence counter write section around
 * the erase, so concurrent lockless walks notice the change.
 */
static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe)
{
	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	rb_erase(&rbe->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);
}
| 517 | |
/* Set remove operation: unlink the element behind @elem_priv from the tree
 * of @set. @net is not used.
 */
static void nft_rbtree_remove(const struct net *net,
			      const struct nft_set *set,
			      struct nft_elem_priv *elem_priv)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *victim = nft_elem_priv_cast(elem_priv);

	nft_rbtree_erase(priv, victim);
}
| 527 | |
| 528 | static void nft_rbtree_activate(const struct net *net, |
| 529 | const struct nft_set *set, |
| 530 | struct nft_elem_priv *elem_priv) |
| 531 | { |
| 532 | struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); |
| 533 | |
| 534 | nft_set_elem_change_active(net, set, &rbe->ext); |
| 535 | } |
| 536 | |
| 537 | static void nft_rbtree_flush(const struct net *net, |
| 538 | const struct nft_set *set, |
| 539 | struct nft_elem_priv *elem_priv) |
| 540 | { |
| 541 | struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); |
| 542 | |
| 543 | nft_set_elem_change_active(net, set, &rbe->ext); |
| 544 | } |
| 545 | |
| 546 | static struct nft_elem_priv * |
| 547 | nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, |
| 548 | const struct nft_set_elem *elem) |
| 549 | { |
| 550 | struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv); |
| 551 | const struct nft_rbtree *priv = nft_set_priv(set); |
| 552 | const struct rb_node *parent = priv->root.rb_node; |
| 553 | u8 genmask = nft_genmask_next(net); |
| 554 | int d; |
| 555 | |
| 556 | while (parent != NULL) { |
| 557 | rbe = rb_entry(parent, struct nft_rbtree_elem, node); |
| 558 | |
| 559 | d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val, |
| 560 | set->klen); |
| 561 | if (d < 0) |
| 562 | parent = parent->rb_left; |
| 563 | else if (d > 0) |
| 564 | parent = parent->rb_right; |
| 565 | else { |
| 566 | if (nft_rbtree_interval_end(rbe) && |
| 567 | nft_rbtree_interval_start(this)) { |
| 568 | parent = parent->rb_left; |
| 569 | continue; |
| 570 | } else if (nft_rbtree_interval_start(rbe) && |
| 571 | nft_rbtree_interval_end(this)) { |
| 572 | parent = parent->rb_right; |
| 573 | continue; |
| 574 | } else if (nft_set_elem_expired(&rbe->ext)) { |
| 575 | break; |
| 576 | } else if (!nft_set_elem_active(&rbe->ext, genmask)) { |
| 577 | parent = parent->rb_left; |
| 578 | continue; |
| 579 | } |
| 580 | nft_rbtree_flush(net, set, &rbe->priv); |
| 581 | return &rbe->priv; |
| 582 | } |
| 583 | } |
| 584 | return NULL; |
| 585 | } |
| 586 | |
| 587 | static void nft_rbtree_walk(const struct nft_ctx *ctx, |
| 588 | struct nft_set *set, |
| 589 | struct nft_set_iter *iter) |
| 590 | { |
| 591 | struct nft_rbtree *priv = nft_set_priv(set); |
| 592 | struct nft_rbtree_elem *rbe; |
| 593 | struct rb_node *node; |
| 594 | |
| 595 | read_lock_bh(&priv->lock); |
| 596 | for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { |
| 597 | rbe = rb_entry(node, struct nft_rbtree_elem, node); |
| 598 | |
| 599 | if (iter->count < iter->skip) |
| 600 | goto cont; |
| 601 | if (!nft_set_elem_active(&rbe->ext, iter->genmask)) |
| 602 | goto cont; |
| 603 | |
| 604 | iter->err = iter->fn(ctx, set, iter, &rbe->priv); |
| 605 | if (iter->err < 0) { |
| 606 | read_unlock_bh(&priv->lock); |
| 607 | return; |
| 608 | } |
| 609 | cont: |
| 610 | iter->count++; |
| 611 | } |
| 612 | read_unlock_bh(&priv->lock); |
| 613 | } |
| 614 | |
/* Deactivate the data of @rbe, then unlink it from the tree.
 *
 * The caller must not hold priv->lock: nft_rbtree_erase() takes the write
 * lock itself.
 */
static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
				 struct nft_rbtree *priv,
				 struct nft_rbtree_elem *rbe)
{
	nft_setelem_data_deactivate(net, set, &rbe->priv);
	nft_rbtree_erase(priv, rbe);
}
| 622 | |
| 623 | static void nft_rbtree_gc(struct nft_set *set) |
| 624 | { |
| 625 | struct nft_rbtree *priv = nft_set_priv(set); |
| 626 | struct nft_rbtree_elem *rbe, *rbe_end = NULL; |
| 627 | struct nftables_pernet *nft_net; |
| 628 | struct rb_node *node, *next; |
| 629 | struct nft_trans_gc *gc; |
| 630 | struct net *net; |
| 631 | |
| 632 | set = nft_set_container_of(priv); |
| 633 | net = read_pnet(&set->net); |
| 634 | nft_net = nft_pernet(net); |
| 635 | |
| 636 | gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); |
| 637 | if (!gc) |
| 638 | return; |
| 639 | |
| 640 | for (node = rb_first(&priv->root); node ; node = next) { |
| 641 | next = rb_next(node); |
| 642 | |
| 643 | rbe = rb_entry(node, struct nft_rbtree_elem, node); |
| 644 | |
| 645 | /* elements are reversed in the rbtree for historical reasons, |
| 646 | * from highest to lowest value, that is why end element is |
| 647 | * always visited before the start element. |
| 648 | */ |
| 649 | if (nft_rbtree_interval_end(rbe)) { |
| 650 | rbe_end = rbe; |
| 651 | continue; |
| 652 | } |
| 653 | if (!nft_set_elem_expired(&rbe->ext)) |
| 654 | continue; |
| 655 | |
| 656 | gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); |
| 657 | if (!gc) |
| 658 | goto try_later; |
| 659 | |
| 660 | /* end element needs to be removed first, it has |
| 661 | * no timeout extension. |
| 662 | */ |
| 663 | if (rbe_end) { |
| 664 | nft_rbtree_gc_remove(net, set, priv, rbe_end); |
| 665 | nft_trans_gc_elem_add(gc, rbe_end); |
| 666 | rbe_end = NULL; |
| 667 | } |
| 668 | |
| 669 | gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); |
| 670 | if (!gc) |
| 671 | goto try_later; |
| 672 | |
| 673 | nft_rbtree_gc_remove(net, set, priv, rbe); |
| 674 | nft_trans_gc_elem_add(gc, rbe); |
| 675 | } |
| 676 | |
| 677 | try_later: |
| 678 | |
| 679 | if (gc) { |
| 680 | gc = nft_trans_gc_catchall_sync(gc); |
| 681 | nft_trans_gc_queue_sync_done(gc); |
| 682 | priv->last_gc = jiffies; |
| 683 | } |
| 684 | } |
| 685 | |
/* Set privsize operation: the backend's private area is always one
 * struct nft_rbtree, regardless of @nla and @desc.
 */
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
			       const struct nft_set_desc *desc)
{
	return sizeof(struct nft_rbtree);
}
| 691 | |
/* Set init operation: set up the lock, the sequence counter associated
 * with that lock, and an empty tree. @desc and @nla are not used.
 *
 * Always returns 0.
 */
static int nft_rbtree_init(const struct nft_set *set,
			   const struct nft_set_desc *desc,
			   const struct nlattr * const nla[])
{
	struct nft_rbtree *priv = nft_set_priv(set);

	/* The element private area must be the first member of the element. */
	BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0);

	rwlock_init(&priv->lock);
	seqcount_rwlock_init(&priv->count, &priv->lock);
	priv->root = RB_ROOT;

	return 0;
}
| 706 | |
| 707 | static void nft_rbtree_destroy(const struct nft_ctx *ctx, |
| 708 | const struct nft_set *set) |
| 709 | { |
| 710 | struct nft_rbtree *priv = nft_set_priv(set); |
| 711 | struct nft_rbtree_elem *rbe; |
| 712 | struct rb_node *node; |
| 713 | |
| 714 | while ((node = priv->root.rb_node) != NULL) { |
| 715 | rb_erase(node, &priv->root); |
| 716 | rbe = rb_entry(node, struct nft_rbtree_elem, node); |
| 717 | nf_tables_set_elem_destroy(ctx, set, &rbe->priv); |
| 718 | } |
| 719 | } |
| 720 | |
| 721 | static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, |
| 722 | struct nft_set_estimate *est) |
| 723 | { |
| 724 | if (desc->field_count > 1) |
| 725 | return false; |
| 726 | |
| 727 | if (desc->size) |
| 728 | est->size = sizeof(struct nft_rbtree) + |
| 729 | desc->size * sizeof(struct nft_rbtree_elem); |
| 730 | else |
| 731 | est->size = ~0; |
| 732 | |
| 733 | est->lookup = NFT_SET_CLASS_O_LOG_N; |
| 734 | est->space = NFT_SET_CLASS_O_N; |
| 735 | |
| 736 | return true; |
| 737 | } |
| 738 | |
| 739 | static void nft_rbtree_commit(struct nft_set *set) |
| 740 | { |
| 741 | struct nft_rbtree *priv = nft_set_priv(set); |
| 742 | |
| 743 | if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) |
| 744 | nft_rbtree_gc(set); |
| 745 | } |
| 746 | |
/* Set gc_init operation: start the gc interval from now by recording the
 * current jiffies as the time of the last run.
 */
static void nft_rbtree_gc_init(const struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);

	priv->last_gc = jiffies;
}
| 753 | |
/* Set type descriptor of the rbtree backend: advertised features and the
 * operations implemented in this file.
 */
const struct nft_set_type nft_set_rbtree_type = {
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
		.privsize	= nft_rbtree_privsize,
		/* Size of the element up to, but not including, the extensions. */
		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
		.estimate	= nft_rbtree_estimate,
		.init		= nft_rbtree_init,
		.destroy	= nft_rbtree_destroy,
		.insert		= nft_rbtree_insert,
		.remove		= nft_rbtree_remove,
		.deactivate	= nft_rbtree_deactivate,
		.flush		= nft_rbtree_flush,
		.activate	= nft_rbtree_activate,
		.commit		= nft_rbtree_commit,
		.gc_init	= nft_rbtree_gc_init,
		.lookup		= nft_rbtree_lookup,
		.walk		= nft_rbtree_walk,
		.get		= nft_rbtree_get,
	},
};