Commit | Line | Data |
---|---|---|
d2912cb1 | 1 | // SPDX-License-Identifier: GPL-2.0-only |
f6180121 MJ |
2 | /* Event cache for netfilter. */ |
3 | ||
f229f6ce PM |
4 | /* |
5 | * (C) 2005 Harald Welte <laforge@gnumonks.org> | |
6 | * (C) 2005 Patrick McHardy <kaber@trash.net> | |
7 | * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org> | |
8 | * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org> | |
f6180121 MJ |
9 | */ |
10 | ||
5191d70f AS |
11 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
12 | ||
f6180121 MJ |
13 | #include <linux/types.h> |
14 | #include <linux/netfilter.h> | |
15 | #include <linux/skbuff.h> | |
16 | #include <linux/vmalloc.h> | |
17 | #include <linux/stddef.h> | |
18 | #include <linux/err.h> | |
19 | #include <linux/percpu.h> | |
f6180121 MJ |
20 | #include <linux/kernel.h> |
21 | #include <linux/netdevice.h> | |
5a0e3ad6 | 22 | #include <linux/slab.h> |
bc3b2d7f | 23 | #include <linux/export.h> |
f6180121 MJ |
24 | |
25 | #include <net/netfilter/nf_conntrack.h> | |
f6180121 | 26 | #include <net/netfilter/nf_conntrack_core.h> |
40d102cd | 27 | #include <net/netfilter/nf_conntrack_ecache.h> |
a0891aa6 | 28 | #include <net/netfilter/nf_conntrack_extend.h> |
f6180121 | 29 | |
/* Taken around every read-modify-write of the per-netns notifier pointers. */
static DEFINE_MUTEX(nf_ct_ecache_mutex);

/* Delay handed to schedule_delayed_work() when redelivery hit congestion. */
#define ECACHE_RETRY_WAIT	(HZ/10)
/* Number of conntrack pointers the worker collects on its stack per pass. */
#define ECACHE_STACK_ALLOC	(256 / sizeof(void *))
9500507c FW |
34 | |
/* Result of one pass over a per-cpu dying list. */
enum retry_state {
	/* Event redelivery failed again; the worker backs off. */
	STATE_CONGESTED,
	/* The on-stack batch filled up; the worker must run again. */
	STATE_RESTART,
	/* The list walk finished without either of the above. */
	STATE_DONE,
};
40 | ||
41 | static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) | |
42 | { | |
63f55acf FW |
43 | struct nf_conn *refs[ECACHE_STACK_ALLOC]; |
44 | enum retry_state ret = STATE_DONE; | |
9500507c FW |
45 | struct nf_conntrack_tuple_hash *h; |
46 | struct hlist_nulls_node *n; | |
47 | unsigned int evicted = 0; | |
9500507c FW |
48 | |
49 | spin_lock(&pcpu->lock); | |
50 | ||
51 | hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { | |
52 | struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); | |
616b14b4 | 53 | struct nf_conntrack_ecache *e; |
9500507c | 54 | |
616b14b4 FW |
55 | if (!nf_ct_is_confirmed(ct)) |
56 | continue; | |
57 | ||
63f55acf FW |
58 | /* This ecache access is safe because the ct is on the |
59 | * pcpu dying list and we hold the spinlock -- the entry | |
60 | * cannot be free'd until after the lock is released. | |
61 | * | |
62 | * This is true even if ct has a refcount of 0: the | |
63 | * cpu that is about to free the entry must remove it | |
64 | * from the dying list and needs the lock to do so. | |
65 | */ | |
616b14b4 FW |
66 | e = nf_ct_ecache_find(ct); |
67 | if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL) | |
9500507c FW |
68 | continue; |
69 | ||
63f55acf FW |
70 | /* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means |
71 | * the worker owns this entry: the ct will remain valid | |
72 | * until the worker puts its ct reference. | |
73 | */ | |
9500507c FW |
74 | if (nf_conntrack_event(IPCT_DESTROY, ct)) { |
75 | ret = STATE_CONGESTED; | |
76 | break; | |
77 | } | |
78 | ||
616b14b4 | 79 | e->state = NFCT_ECACHE_DESTROY_SENT; |
9500507c FW |
80 | refs[evicted] = ct; |
81 | ||
82 | if (++evicted >= ARRAY_SIZE(refs)) { | |
83 | ret = STATE_RESTART; | |
84 | break; | |
85 | } | |
86 | } | |
87 | ||
88 | spin_unlock(&pcpu->lock); | |
89 | ||
90 | /* can't _put while holding lock */ | |
91 | while (evicted) | |
92 | nf_ct_put(refs[--evicted]); | |
93 | ||
94 | return ret; | |
95 | } | |
96 | ||
97 | static void ecache_work(struct work_struct *work) | |
98 | { | |
99 | struct netns_ct *ctnet = | |
100 | container_of(work, struct netns_ct, ecache_dwork.work); | |
101 | int cpu, delay = -1; | |
102 | struct ct_pcpu *pcpu; | |
103 | ||
104 | local_bh_disable(); | |
105 | ||
106 | for_each_possible_cpu(cpu) { | |
107 | enum retry_state ret; | |
108 | ||
109 | pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu); | |
110 | ||
111 | ret = ecache_work_evict_list(pcpu); | |
112 | ||
113 | switch (ret) { | |
114 | case STATE_CONGESTED: | |
115 | delay = ECACHE_RETRY_WAIT; | |
116 | goto out; | |
117 | case STATE_RESTART: | |
118 | delay = 0; | |
119 | break; | |
120 | case STATE_DONE: | |
121 | break; | |
122 | } | |
123 | } | |
124 | ||
125 | out: | |
126 | local_bh_enable(); | |
127 | ||
128 | ctnet->ecache_dwork_pending = delay > 0; | |
129 | if (delay >= 0) | |
130 | schedule_delayed_work(&ctnet->ecache_dwork, delay); | |
131 | } | |
132 | ||
3c435e2e FW |
133 | int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, |
134 | u32 portid, int report) | |
135 | { | |
136 | int ret = 0; | |
137 | struct net *net = nf_ct_net(ct); | |
138 | struct nf_ct_event_notifier *notify; | |
139 | struct nf_conntrack_ecache *e; | |
140 | ||
141 | rcu_read_lock(); | |
142 | notify = rcu_dereference(net->ct.nf_conntrack_event_cb); | |
143 | if (!notify) | |
144 | goto out_unlock; | |
145 | ||
146 | e = nf_ct_ecache_find(ct); | |
147 | if (!e) | |
148 | goto out_unlock; | |
149 | ||
616b14b4 | 150 | if (nf_ct_is_confirmed(ct)) { |
3c435e2e FW |
151 | struct nf_ct_event item = { |
152 | .ct = ct, | |
153 | .portid = e->portid ? e->portid : portid, | |
154 | .report = report | |
155 | }; | |
156 | /* This is a resent of a destroy event? If so, skip missed */ | |
157 | unsigned long missed = e->portid ? 0 : e->missed; | |
158 | ||
159 | if (!((eventmask | missed) & e->ctmask)) | |
160 | goto out_unlock; | |
161 | ||
162 | ret = notify->fcn(eventmask | missed, &item); | |
163 | if (unlikely(ret < 0 || missed)) { | |
164 | spin_lock_bh(&ct->lock); | |
165 | if (ret < 0) { | |
166 | /* This is a destroy event that has been | |
167 | * triggered by a process, we store the PORTID | |
168 | * to include it in the retransmission. | |
169 | */ | |
616b14b4 FW |
170 | if (eventmask & (1 << IPCT_DESTROY)) { |
171 | if (e->portid == 0 && portid != 0) | |
172 | e->portid = portid; | |
173 | e->state = NFCT_ECACHE_DESTROY_FAIL; | |
174 | } else { | |
3c435e2e | 175 | e->missed |= eventmask; |
616b14b4 | 176 | } |
3c435e2e FW |
177 | } else { |
178 | e->missed &= ~missed; | |
179 | } | |
180 | spin_unlock_bh(&ct->lock); | |
181 | } | |
182 | } | |
183 | out_unlock: | |
184 | rcu_read_unlock(); | |
185 | return ret; | |
186 | } | |
187 | EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report); | |
188 | ||
f6180121 MJ |
189 | /* deliver cached events and clear cache entry - must be called with locally |
190 | * disabled softirqs */ | |
a0891aa6 | 191 | void nf_ct_deliver_cached_events(struct nf_conn *ct) |
f6180121 | 192 | { |
70e9942f | 193 | struct net *net = nf_ct_net(ct); |
58020f77 | 194 | unsigned long events, missed; |
e34d5c1a | 195 | struct nf_ct_event_notifier *notify; |
a0891aa6 | 196 | struct nf_conntrack_ecache *e; |
58020f77 TZ |
197 | struct nf_ct_event item; |
198 | int ret; | |
e34d5c1a PNA |
199 | |
200 | rcu_read_lock(); | |
70e9942f | 201 | notify = rcu_dereference(net->ct.nf_conntrack_event_cb); |
e34d5c1a PNA |
202 | if (notify == NULL) |
203 | goto out_unlock; | |
204 | ||
ad88b7a6 FW |
205 | if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) |
206 | goto out_unlock; | |
207 | ||
a0891aa6 PNA |
208 | e = nf_ct_ecache_find(ct); |
209 | if (e == NULL) | |
210 | goto out_unlock; | |
211 | ||
212 | events = xchg(&e->cache, 0); | |
213 | ||
58020f77 TZ |
214 | /* We make a copy of the missed event cache without taking |
215 | * the lock, thus we may send missed events twice. However, | |
216 | * this does not harm and it happens very rarely. */ | |
217 | missed = e->missed; | |
218 | ||
219 | if (!((events | missed) & e->ctmask)) | |
220 | goto out_unlock; | |
221 | ||
222 | item.ct = ct; | |
15e47304 | 223 | item.portid = 0; |
58020f77 TZ |
224 | item.report = 0; |
225 | ||
226 | ret = notify->fcn(events | missed, &item); | |
227 | ||
6e354a5e | 228 | if (likely(ret == 0 && !missed)) |
58020f77 TZ |
229 | goto out_unlock; |
230 | ||
231 | spin_lock_bh(&ct->lock); | |
232 | if (ret < 0) | |
233 | e->missed |= events; | |
234 | else | |
235 | e->missed &= ~missed; | |
236 | spin_unlock_bh(&ct->lock); | |
f6180121 | 237 | |
e34d5c1a PNA |
238 | out_unlock: |
239 | rcu_read_unlock(); | |
f6180121 | 240 | } |
13b18339 | 241 | EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); |
f6180121 | 242 | |
ecdfb48c FW |
243 | void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, |
244 | struct nf_conntrack_expect *exp, | |
245 | u32 portid, int report) | |
246 | ||
247 | { | |
248 | struct net *net = nf_ct_exp_net(exp); | |
249 | struct nf_exp_event_notifier *notify; | |
250 | struct nf_conntrack_ecache *e; | |
251 | ||
252 | rcu_read_lock(); | |
253 | notify = rcu_dereference(net->ct.nf_expect_event_cb); | |
254 | if (!notify) | |
255 | goto out_unlock; | |
256 | ||
257 | e = nf_ct_ecache_find(exp->master); | |
258 | if (!e) | |
259 | goto out_unlock; | |
260 | ||
261 | if (e->expmask & (1 << event)) { | |
262 | struct nf_exp_event item = { | |
263 | .exp = exp, | |
264 | .portid = portid, | |
265 | .report = report | |
266 | }; | |
267 | notify->fcn(1 << event, &item); | |
268 | } | |
269 | out_unlock: | |
270 | rcu_read_unlock(); | |
271 | } | |
272 | ||
70e9942f PNA |
273 | int nf_conntrack_register_notifier(struct net *net, |
274 | struct nf_ct_event_notifier *new) | |
010c7d6f | 275 | { |
031d7709 | 276 | int ret; |
b56f2d55 | 277 | struct nf_ct_event_notifier *notify; |
e34d5c1a PNA |
278 | |
279 | mutex_lock(&nf_ct_ecache_mutex); | |
70e9942f | 280 | notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, |
b56f2d55 PM |
281 | lockdep_is_held(&nf_ct_ecache_mutex)); |
282 | if (notify != NULL) { | |
e34d5c1a PNA |
283 | ret = -EBUSY; |
284 | goto out_unlock; | |
285 | } | |
cf778b00 | 286 | rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); |
031d7709 | 287 | ret = 0; |
e34d5c1a PNA |
288 | |
289 | out_unlock: | |
290 | mutex_unlock(&nf_ct_ecache_mutex); | |
291 | return ret; | |
010c7d6f PM |
292 | } |
293 | EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); | |
294 | ||
70e9942f PNA |
295 | void nf_conntrack_unregister_notifier(struct net *net, |
296 | struct nf_ct_event_notifier *new) | |
010c7d6f | 297 | { |
b56f2d55 PM |
298 | struct nf_ct_event_notifier *notify; |
299 | ||
e34d5c1a | 300 | mutex_lock(&nf_ct_ecache_mutex); |
70e9942f | 301 | notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, |
b56f2d55 PM |
302 | lockdep_is_held(&nf_ct_ecache_mutex)); |
303 | BUG_ON(notify != new); | |
70e9942f | 304 | RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); |
e34d5c1a | 305 | mutex_unlock(&nf_ct_ecache_mutex); |
3b7dabf0 | 306 | /* synchronize_rcu() is called from ctnetlink_exit. */ |
010c7d6f PM |
307 | } |
308 | EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); | |
309 | ||
70e9942f PNA |
310 | int nf_ct_expect_register_notifier(struct net *net, |
311 | struct nf_exp_event_notifier *new) | |
010c7d6f | 312 | { |
031d7709 | 313 | int ret; |
b56f2d55 | 314 | struct nf_exp_event_notifier *notify; |
e34d5c1a PNA |
315 | |
316 | mutex_lock(&nf_ct_ecache_mutex); | |
70e9942f | 317 | notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, |
b56f2d55 PM |
318 | lockdep_is_held(&nf_ct_ecache_mutex)); |
319 | if (notify != NULL) { | |
e34d5c1a PNA |
320 | ret = -EBUSY; |
321 | goto out_unlock; | |
322 | } | |
cf778b00 | 323 | rcu_assign_pointer(net->ct.nf_expect_event_cb, new); |
031d7709 | 324 | ret = 0; |
e34d5c1a PNA |
325 | |
326 | out_unlock: | |
327 | mutex_unlock(&nf_ct_ecache_mutex); | |
328 | return ret; | |
010c7d6f | 329 | } |
6823645d | 330 | EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier); |
010c7d6f | 331 | |
70e9942f PNA |
332 | void nf_ct_expect_unregister_notifier(struct net *net, |
333 | struct nf_exp_event_notifier *new) | |
010c7d6f | 334 | { |
b56f2d55 PM |
335 | struct nf_exp_event_notifier *notify; |
336 | ||
e34d5c1a | 337 | mutex_lock(&nf_ct_ecache_mutex); |
70e9942f | 338 | notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, |
b56f2d55 PM |
339 | lockdep_is_held(&nf_ct_ecache_mutex)); |
340 | BUG_ON(notify != new); | |
70e9942f | 341 | RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL); |
e34d5c1a | 342 | mutex_unlock(&nf_ct_ecache_mutex); |
3b7dabf0 | 343 | /* synchronize_rcu() is called from ctnetlink_exit. */ |
010c7d6f | 344 | } |
6823645d | 345 | EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); |
a0891aa6 PNA |
346 | |
347 | #define NF_CT_EVENTS_DEFAULT 1 | |
348 | static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; | |
349 | ||
23f671a1 | 350 | static const struct nf_ct_ext_type event_extend = { |
a0891aa6 PNA |
351 | .len = sizeof(struct nf_conntrack_ecache), |
352 | .align = __alignof__(struct nf_conntrack_ecache), | |
353 | .id = NF_CT_EXT_ECACHE, | |
354 | }; | |
355 | ||
fc3893fd | 356 | void nf_conntrack_ecache_pernet_init(struct net *net) |
a0891aa6 | 357 | { |
a0891aa6 | 358 | net->ct.sysctl_events = nf_ct_events; |
9500507c | 359 | INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work); |
3fe0f943 | 360 | } |
a0891aa6 | 361 | |
3fe0f943 G |
362 | void nf_conntrack_ecache_pernet_fini(struct net *net) |
363 | { | |
9500507c | 364 | cancel_delayed_work_sync(&net->ct.ecache_dwork); |
3fe0f943 | 365 | } |
a0891aa6 | 366 | |
3fe0f943 G |
367 | int nf_conntrack_ecache_init(void) |
368 | { | |
369 | int ret = nf_ct_extend_register(&event_extend); | |
a0891aa6 | 370 | if (ret < 0) |
5191d70f | 371 | pr_err("Unable to register event extension\n"); |
01026ede FW |
372 | |
373 | BUILD_BUG_ON(__IPCT_MAX >= 16); /* ctmask, missed use u16 */ | |
374 | ||
a0891aa6 PNA |
375 | return ret; |
376 | } | |
377 | ||
3fe0f943 | 378 | void nf_conntrack_ecache_fini(void) |
a0891aa6 | 379 | { |
3fe0f943 | 380 | nf_ct_extend_unregister(&event_extend); |
a0891aa6 | 381 | } |