bpf: add bpf_jit_limit knob to restrict unpriv allocations
[linux-2.6-block.git] / net / netfilter / nf_conncount.c
1 /*
2  * count the number of connections matching an arbitrary key.
3  *
4  * (C) 2017 Red Hat GmbH
5  * Author: Florian Westphal <fw@strlen.de>
6  *
7  * split from xt_connlimit.c:
8  *   (c) 2000 Gerd Knorr <kraxel@bytesex.org>
9  *   Nov 2002: Martin Bene <martin.bene@icomedias.com>:
10  *              only ignore TIME_WAIT or gone connections
11  *   (C) CC Computer Consultants GmbH, 2007
12  */
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14 #include <linux/in.h>
15 #include <linux/in6.h>
16 #include <linux/ip.h>
17 #include <linux/ipv6.h>
18 #include <linux/jhash.h>
19 #include <linux/slab.h>
20 #include <linux/list.h>
21 #include <linux/rbtree.h>
22 #include <linux/module.h>
23 #include <linux/random.h>
24 #include <linux/skbuff.h>
25 #include <linux/spinlock.h>
26 #include <linux/netfilter/nf_conntrack_tcp.h>
27 #include <linux/netfilter/x_tables.h>
28 #include <net/netfilter/nf_conntrack.h>
29 #include <net/netfilter/nf_conntrack_count.h>
30 #include <net/netfilter/nf_conntrack_core.h>
31 #include <net/netfilter/nf_conntrack_tuple.h>
32 #include <net/netfilter/nf_conntrack_zones.h>
33
/* Number of rb-tree roots in each nf_conncount_data; keys hash to one of them. */
#define CONNCOUNT_SLOTS 256U

/* Number of tree spinlocks; must evenly divide CONNCOUNT_SLOTS. */
#if !defined(CONFIG_LOCKDEP)
#define CONNCOUNT_LOCK_SLOTS 256U
#else
#define CONNCOUNT_LOCK_SLOTS 8U
#endif

/* Capacity of the on-stack gc_nodes[] arrays; also the per-call reap threshold. */
#define CONNCOUNT_GC_MAX_NODES 8
/* Largest supported key, counted in u32 words. */
#define MAX_KEYLEN 5
44
45 /* we will save the tuples of all connections we care about */
46 struct nf_conncount_tuple {
47         struct list_head                node;
48         struct nf_conntrack_tuple       tuple;
49         struct nf_conntrack_zone        zone;
50         int                             cpu;
51         u32                             jiffies32;
52         struct rcu_head                 rcu_head;
53 };
54
55 struct nf_conncount_rb {
56         struct rb_node node;
57         struct nf_conncount_list list;
58         u32 key[MAX_KEYLEN];
59         struct rcu_head rcu_head;
60 };
61
/* Tree locks: insert_tree() takes slot (hash % CONNCOUNT_LOCK_SLOTS) before
 * modifying data->root[hash]; initialised in nf_conncount_modinit().
 */
62 static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp;
63
64 struct nf_conncount_data {
65         unsigned int keylen;
66         struct rb_root root[CONNCOUNT_SLOTS];
67         struct net *net;
68         struct work_struct gc_work;
69         unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)];
70         unsigned int gc_tree;
71 };
72
/* Seed passed to jhash2() in count_tree(); filled once by nf_conncount_init(). */
73 static u_int32_t conncount_rnd __read_mostly;
/* Slab caches for tree nodes and list entries; created in nf_conncount_modinit(). */
74 static struct kmem_cache *conncount_rb_cachep __read_mostly;
75 static struct kmem_cache *conncount_conn_cachep __read_mostly;
76
77 static inline bool already_closed(const struct nf_conn *conn)
78 {
79         if (nf_ct_protonum(conn) == IPPROTO_TCP)
80                 return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT ||
81                        conn->proto.tcp.state == TCP_CONNTRACK_CLOSE;
82         else
83                 return false;
84 }
85
86 static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
87 {
88         return memcmp(a, b, klen * sizeof(u32));
89 }
90
91 enum nf_conncount_list_add
92 nf_conncount_add(struct nf_conncount_list *list,
93                  const struct nf_conntrack_tuple *tuple,
94                  const struct nf_conntrack_zone *zone)
95 {
96         struct nf_conncount_tuple *conn;
97
98         if (WARN_ON_ONCE(list->count > INT_MAX))
99                 return NF_CONNCOUNT_ERR;
100
101         conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
102         if (conn == NULL)
103                 return NF_CONNCOUNT_ERR;
104
105         conn->tuple = *tuple;
106         conn->zone = *zone;
107         conn->cpu = raw_smp_processor_id();
108         conn->jiffies32 = (u32)jiffies;
109         spin_lock(&list->list_lock);
110         if (list->dead == true) {
111                 kmem_cache_free(conncount_conn_cachep, conn);
112                 spin_unlock(&list->list_lock);
113                 return NF_CONNCOUNT_SKIP;
114         }
115         list_add_tail(&conn->node, &list->head);
116         list->count++;
117         spin_unlock(&list->list_lock);
118         return NF_CONNCOUNT_ADDED;
119 }
120 EXPORT_SYMBOL_GPL(nf_conncount_add);
121
122 static void __conn_free(struct rcu_head *h)
123 {
124         struct nf_conncount_tuple *conn;
125
126         conn = container_of(h, struct nf_conncount_tuple, rcu_head);
127         kmem_cache_free(conncount_conn_cachep, conn);
128 }
129
130 static bool conn_free(struct nf_conncount_list *list,
131                       struct nf_conncount_tuple *conn)
132 {
133         bool free_entry = false;
134
135         spin_lock(&list->list_lock);
136
137         if (list->count == 0) {
138                 spin_unlock(&list->list_lock);
139                 return free_entry;
140         }
141
142         list->count--;
143         list_del_rcu(&conn->node);
144         if (list->count == 0)
145                 free_entry = true;
146
147         spin_unlock(&list->list_lock);
148         call_rcu(&conn->rcu_head, __conn_free);
149         return free_entry;
150 }
151
152 static const struct nf_conntrack_tuple_hash *
153 find_or_evict(struct net *net, struct nf_conncount_list *list,
154               struct nf_conncount_tuple *conn, bool *free_entry)
155 {
156         const struct nf_conntrack_tuple_hash *found;
157         unsigned long a, b;
158         int cpu = raw_smp_processor_id();
159         __s32 age;
160
161         found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple);
162         if (found)
163                 return found;
164         b = conn->jiffies32;
165         a = (u32)jiffies;
166
167         /* conn might have been added just before by another cpu and
168          * might still be unconfirmed.  In this case, nf_conntrack_find()
169          * returns no result.  Thus only evict if this cpu added the
170          * stale entry or if the entry is older than two jiffies.
171          */
172         age = a - b;
173         if (conn->cpu == cpu || age >= 2) {
174                 *free_entry = conn_free(list, conn);
175                 return ERR_PTR(-ENOENT);
176         }
177
178         return ERR_PTR(-EAGAIN);
179 }
180
181 void nf_conncount_lookup(struct net *net,
182                          struct nf_conncount_list *list,
183                          const struct nf_conntrack_tuple *tuple,
184                          const struct nf_conntrack_zone *zone,
185                          bool *addit)
186 {
187         const struct nf_conntrack_tuple_hash *found;
188         struct nf_conncount_tuple *conn, *conn_n;
189         struct nf_conn *found_ct;
190         unsigned int collect = 0;
191         bool free_entry = false;
192
193         /* best effort only */
194         *addit = tuple ? true : false;
195
196         /* check the saved connections */
197         list_for_each_entry_safe(conn, conn_n, &list->head, node) {
198                 if (collect > CONNCOUNT_GC_MAX_NODES)
199                         break;
200
201                 found = find_or_evict(net, list, conn, &free_entry);
202                 if (IS_ERR(found)) {
203                         /* Not found, but might be about to be confirmed */
204                         if (PTR_ERR(found) == -EAGAIN) {
205                                 if (!tuple)
206                                         continue;
207
208                                 if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
209                                     nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
210                                     nf_ct_zone_id(zone, zone->dir))
211                                         *addit = false;
212                         } else if (PTR_ERR(found) == -ENOENT)
213                                 collect++;
214                         continue;
215                 }
216
217                 found_ct = nf_ct_tuplehash_to_ctrack(found);
218
219                 if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) &&
220                     nf_ct_zone_equal(found_ct, zone, zone->dir)) {
221                         /*
222                          * We should not see tuples twice unless someone hooks
223                          * this into a table without "-p tcp --syn".
224                          *
225                          * Attempt to avoid a re-add in this case.
226                          */
227                         *addit = false;
228                 } else if (already_closed(found_ct)) {
229                         /*
230                          * we do not care about connections which are
231                          * closed already -> ditch it
232                          */
233                         nf_ct_put(found_ct);
234                         conn_free(list, conn);
235                         collect++;
236                         continue;
237                 }
238
239                 nf_ct_put(found_ct);
240         }
241 }
242 EXPORT_SYMBOL_GPL(nf_conncount_lookup);
243
244 void nf_conncount_list_init(struct nf_conncount_list *list)
245 {
246         spin_lock_init(&list->list_lock);
247         INIT_LIST_HEAD(&list->head);
248         list->count = 1;
249         list->dead = false;
250 }
251 EXPORT_SYMBOL_GPL(nf_conncount_list_init);
252
253 /* Return true if the list is empty */
254 bool nf_conncount_gc_list(struct net *net,
255                           struct nf_conncount_list *list)
256 {
257         const struct nf_conntrack_tuple_hash *found;
258         struct nf_conncount_tuple *conn, *conn_n;
259         struct nf_conn *found_ct;
260         unsigned int collected = 0;
261         bool free_entry = false;
262
263         list_for_each_entry_safe(conn, conn_n, &list->head, node) {
264                 found = find_or_evict(net, list, conn, &free_entry);
265                 if (IS_ERR(found)) {
266                         if (PTR_ERR(found) == -ENOENT)  {
267                                 if (free_entry)
268                                         return true;
269                                 collected++;
270                         }
271                         continue;
272                 }
273
274                 found_ct = nf_ct_tuplehash_to_ctrack(found);
275                 if (already_closed(found_ct)) {
276                         /*
277                          * we do not care about connections which are
278                          * closed already -> ditch it
279                          */
280                         nf_ct_put(found_ct);
281                         if (conn_free(list, conn))
282                                 return true;
283                         collected++;
284                         continue;
285                 }
286
287                 nf_ct_put(found_ct);
288                 if (collected > CONNCOUNT_GC_MAX_NODES)
289                         return false;
290         }
291         return false;
292 }
293 EXPORT_SYMBOL_GPL(nf_conncount_gc_list);
294
295 static void __tree_nodes_free(struct rcu_head *h)
296 {
297         struct nf_conncount_rb *rbconn;
298
299         rbconn = container_of(h, struct nf_conncount_rb, rcu_head);
300         kmem_cache_free(conncount_rb_cachep, rbconn);
301 }
302
303 static void tree_nodes_free(struct rb_root *root,
304                             struct nf_conncount_rb *gc_nodes[],
305                             unsigned int gc_count)
306 {
307         struct nf_conncount_rb *rbconn;
308
309         while (gc_count) {
310                 rbconn = gc_nodes[--gc_count];
311                 spin_lock(&rbconn->list.list_lock);
312                 if (rbconn->list.count == 0 && rbconn->list.dead == false) {
313                         rbconn->list.dead = true;
314                         rb_erase(&rbconn->node, root);
315                         call_rcu(&rbconn->rcu_head, __tree_nodes_free);
316                 }
317                 spin_unlock(&rbconn->list.list_lock);
318         }
319 }
320
321 static void schedule_gc_worker(struct nf_conncount_data *data, int tree)
322 {
323         set_bit(tree, data->pending_trees);
324         schedule_work(&data->gc_work);
325 }
326
327 static unsigned int
328 insert_tree(struct net *net,
329             struct nf_conncount_data *data,
330             struct rb_root *root,
331             unsigned int hash,
332             const u32 *key,
333             u8 keylen,
334             const struct nf_conntrack_tuple *tuple,
335             const struct nf_conntrack_zone *zone)
336 {
337         enum nf_conncount_list_add ret;
338         struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
339         struct rb_node **rbnode, *parent;
340         struct nf_conncount_rb *rbconn;
341         struct nf_conncount_tuple *conn;
342         unsigned int count = 0, gc_count = 0;
343         bool node_found = false;
344
345         spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
346
347         parent = NULL;
348         rbnode = &(root->rb_node);
349         while (*rbnode) {
350                 int diff;
351                 rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
352
353                 parent = *rbnode;
354                 diff = key_diff(key, rbconn->key, keylen);
355                 if (diff < 0) {
356                         rbnode = &((*rbnode)->rb_left);
357                 } else if (diff > 0) {
358                         rbnode = &((*rbnode)->rb_right);
359                 } else {
360                         /* unlikely: other cpu added node already */
361                         node_found = true;
362                         ret = nf_conncount_add(&rbconn->list, tuple, zone);
363                         if (ret == NF_CONNCOUNT_ERR) {
364                                 count = 0; /* hotdrop */
365                         } else if (ret == NF_CONNCOUNT_ADDED) {
366                                 count = rbconn->list.count;
367                         } else {
368                                 /* NF_CONNCOUNT_SKIP, rbconn is already
369                                  * reclaimed by gc, insert a new tree node
370                                  */
371                                 node_found = false;
372                         }
373                         break;
374                 }
375
376                 if (gc_count >= ARRAY_SIZE(gc_nodes))
377                         continue;
378
379                 if (nf_conncount_gc_list(net, &rbconn->list))
380                         gc_nodes[gc_count++] = rbconn;
381         }
382
383         if (gc_count) {
384                 tree_nodes_free(root, gc_nodes, gc_count);
385                 /* tree_node_free before new allocation permits
386                  * allocator to re-use newly free'd object.
387                  *
388                  * This is a rare event; in most cases we will find
389                  * existing node to re-use. (or gc_count is 0).
390                  */
391
392                 if (gc_count >= ARRAY_SIZE(gc_nodes))
393                         schedule_gc_worker(data, hash);
394         }
395
396         if (node_found)
397                 goto out_unlock;
398
399         /* expected case: match, insert new node */
400         rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
401         if (rbconn == NULL)
402                 goto out_unlock;
403
404         conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
405         if (conn == NULL) {
406                 kmem_cache_free(conncount_rb_cachep, rbconn);
407                 goto out_unlock;
408         }
409
410         conn->tuple = *tuple;
411         conn->zone = *zone;
412         memcpy(rbconn->key, key, sizeof(u32) * keylen);
413
414         nf_conncount_list_init(&rbconn->list);
415         list_add(&conn->node, &rbconn->list.head);
416         count = 1;
417
418         rb_link_node(&rbconn->node, parent, rbnode);
419         rb_insert_color(&rbconn->node, root);
420 out_unlock:
421         spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
422         return count;
423 }
424
425 static unsigned int
426 count_tree(struct net *net,
427            struct nf_conncount_data *data,
428            const u32 *key,
429            const struct nf_conntrack_tuple *tuple,
430            const struct nf_conntrack_zone *zone)
431 {
432         enum nf_conncount_list_add ret;
433         struct rb_root *root;
434         struct rb_node *parent;
435         struct nf_conncount_rb *rbconn;
436         unsigned int hash;
437         u8 keylen = data->keylen;
438
439         hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
440         root = &data->root[hash];
441
442         parent = rcu_dereference_raw(root->rb_node);
443         while (parent) {
444                 int diff;
445                 bool addit;
446
447                 rbconn = rb_entry(parent, struct nf_conncount_rb, node);
448
449                 diff = key_diff(key, rbconn->key, keylen);
450                 if (diff < 0) {
451                         parent = rcu_dereference_raw(parent->rb_left);
452                 } else if (diff > 0) {
453                         parent = rcu_dereference_raw(parent->rb_right);
454                 } else {
455                         /* same source network -> be counted! */
456                         nf_conncount_lookup(net, &rbconn->list, tuple, zone,
457                                             &addit);
458
459                         if (!addit)
460                                 return rbconn->list.count;
461
462                         ret = nf_conncount_add(&rbconn->list, tuple, zone);
463                         if (ret == NF_CONNCOUNT_ERR) {
464                                 return 0; /* hotdrop */
465                         } else if (ret == NF_CONNCOUNT_ADDED) {
466                                 return rbconn->list.count;
467                         } else {
468                                 /* NF_CONNCOUNT_SKIP, rbconn is already
469                                  * reclaimed by gc, insert a new tree node
470                                  */
471                                 break;
472                         }
473                 }
474         }
475
476         if (!tuple)
477                 return 0;
478
479         return insert_tree(net, data, root, hash, key, keylen, tuple, zone);
480 }
481
482 static void tree_gc_worker(struct work_struct *work)
483 {
484         struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
485         struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
486         struct rb_root *root;
487         struct rb_node *node;
488         unsigned int tree, next_tree, gc_count = 0;
489
490         tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS;
491         root = &data->root[tree];
492
493         rcu_read_lock();
494         for (node = rb_first(root); node != NULL; node = rb_next(node)) {
495                 rbconn = rb_entry(node, struct nf_conncount_rb, node);
496                 if (nf_conncount_gc_list(data->net, &rbconn->list))
497                         gc_nodes[gc_count++] = rbconn;
498         }
499         rcu_read_unlock();
500
501         spin_lock_bh(&nf_conncount_locks[tree]);
502
503         if (gc_count) {
504                 tree_nodes_free(root, gc_nodes, gc_count);
505         }
506
507         clear_bit(tree, data->pending_trees);
508
509         next_tree = (tree + 1) % CONNCOUNT_SLOTS;
510         next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS);
511
512         if (next_tree < CONNCOUNT_SLOTS) {
513                 data->gc_tree = next_tree;
514                 schedule_work(work);
515         }
516
517         spin_unlock_bh(&nf_conncount_locks[tree]);
518 }
519
520 /* Count and return number of conntrack entries in 'net' with particular 'key'.
521  * If 'tuple' is not null, insert it into the accounting data structure.
522  * Call with RCU read lock.
523  */
524 unsigned int nf_conncount_count(struct net *net,
525                                 struct nf_conncount_data *data,
526                                 const u32 *key,
527                                 const struct nf_conntrack_tuple *tuple,
528                                 const struct nf_conntrack_zone *zone)
529 {
530         return count_tree(net, data, key, tuple, zone);
531 }
532 EXPORT_SYMBOL_GPL(nf_conncount_count);
533
534 struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,
535                                             unsigned int keylen)
536 {
537         struct nf_conncount_data *data;
538         int ret, i;
539
540         if (keylen % sizeof(u32) ||
541             keylen / sizeof(u32) > MAX_KEYLEN ||
542             keylen == 0)
543                 return ERR_PTR(-EINVAL);
544
545         net_get_random_once(&conncount_rnd, sizeof(conncount_rnd));
546
547         data = kmalloc(sizeof(*data), GFP_KERNEL);
548         if (!data)
549                 return ERR_PTR(-ENOMEM);
550
551         ret = nf_ct_netns_get(net, family);
552         if (ret < 0) {
553                 kfree(data);
554                 return ERR_PTR(ret);
555         }
556
557         for (i = 0; i < ARRAY_SIZE(data->root); ++i)
558                 data->root[i] = RB_ROOT;
559
560         data->keylen = keylen / sizeof(u32);
561         data->net = net;
562         INIT_WORK(&data->gc_work, tree_gc_worker);
563
564         return data;
565 }
566 EXPORT_SYMBOL_GPL(nf_conncount_init);
567
568 void nf_conncount_cache_free(struct nf_conncount_list *list)
569 {
570         struct nf_conncount_tuple *conn, *conn_n;
571
572         list_for_each_entry_safe(conn, conn_n, &list->head, node)
573                 kmem_cache_free(conncount_conn_cachep, conn);
574 }
575 EXPORT_SYMBOL_GPL(nf_conncount_cache_free);
576
577 static void destroy_tree(struct rb_root *r)
578 {
579         struct nf_conncount_rb *rbconn;
580         struct rb_node *node;
581
582         while ((node = rb_first(r)) != NULL) {
583                 rbconn = rb_entry(node, struct nf_conncount_rb, node);
584
585                 rb_erase(node, r);
586
587                 nf_conncount_cache_free(&rbconn->list);
588
589                 kmem_cache_free(conncount_rb_cachep, rbconn);
590         }
591 }
592
593 void nf_conncount_destroy(struct net *net, unsigned int family,
594                           struct nf_conncount_data *data)
595 {
596         unsigned int i;
597
598         cancel_work_sync(&data->gc_work);
599         nf_ct_netns_put(net, family);
600
601         for (i = 0; i < ARRAY_SIZE(data->root); ++i)
602                 destroy_tree(&data->root[i]);
603
604         kfree(data);
605 }
606 EXPORT_SYMBOL_GPL(nf_conncount_destroy);
607
608 static int __init nf_conncount_modinit(void)
609 {
610         int i;
611
612         BUILD_BUG_ON(CONNCOUNT_LOCK_SLOTS > CONNCOUNT_SLOTS);
613         BUILD_BUG_ON((CONNCOUNT_SLOTS % CONNCOUNT_LOCK_SLOTS) != 0);
614
615         for (i = 0; i < CONNCOUNT_LOCK_SLOTS; ++i)
616                 spin_lock_init(&nf_conncount_locks[i]);
617
618         conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple",
619                                            sizeof(struct nf_conncount_tuple),
620                                            0, 0, NULL);
621         if (!conncount_conn_cachep)
622                 return -ENOMEM;
623
624         conncount_rb_cachep = kmem_cache_create("nf_conncount_rb",
625                                            sizeof(struct nf_conncount_rb),
626                                            0, 0, NULL);
627         if (!conncount_rb_cachep) {
628                 kmem_cache_destroy(conncount_conn_cachep);
629                 return -ENOMEM;
630         }
631
632         return 0;
633 }
634
635 static void __exit nf_conncount_modexit(void)
636 {
637         kmem_cache_destroy(conncount_conn_cachep);
638         kmem_cache_destroy(conncount_rb_cachep);
639 }
640
/* Module entry/exit hooks and metadata. */
641 module_init(nf_conncount_modinit);
642 module_exit(nf_conncount_modexit);
643 MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
644 MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
645 MODULE_DESCRIPTION("netfilter: count number of connections matching a key");
646 MODULE_LICENSE("GPL");