Commit | Line | Data |
---|---|---|
d2912cb1 | 1 | // SPDX-License-Identifier: GPL-2.0-only |
9fb9cbb1 YK |
2 | /* Connection state tracking for netfilter. This is separated from, |
3 | but required by, the NAT layer; it can also be used by an iptables | |
4 | extension. */ | |
5 | ||
6 | /* (C) 1999-2001 Paul `Rusty' Russell | |
dc808fe2 | 7 | * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> |
9fb9cbb1 | 8 | * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> |
f229f6ce | 9 | * (C) 2005-2012 Patrick McHardy <kaber@trash.net> |
9fb9cbb1 YK |
10 | */ |
11 | ||
ccd63c20 WJ |
12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
13 | ||
9fb9cbb1 YK |
14 | #include <linux/types.h> |
15 | #include <linux/netfilter.h> | |
16 | #include <linux/module.h> | |
d43c36dc | 17 | #include <linux/sched.h> |
9fb9cbb1 YK |
18 | #include <linux/skbuff.h> |
19 | #include <linux/proc_fs.h> | |
20 | #include <linux/vmalloc.h> | |
21 | #include <linux/stddef.h> | |
22 | #include <linux/slab.h> | |
23 | #include <linux/random.h> | |
3c791076 | 24 | #include <linux/siphash.h> |
9fb9cbb1 YK |
25 | #include <linux/err.h> |
26 | #include <linux/percpu.h> | |
27 | #include <linux/moduleparam.h> | |
28 | #include <linux/notifier.h> | |
29 | #include <linux/kernel.h> | |
30 | #include <linux/netdevice.h> | |
31 | #include <linux/socket.h> | |
d7fe0f24 | 32 | #include <linux/mm.h> |
d696c7bd | 33 | #include <linux/nsproxy.h> |
ea781f19 | 34 | #include <linux/rculist_nulls.h> |
9fb9cbb1 | 35 | |
9fb9cbb1 | 36 | #include <net/netfilter/nf_conntrack.h> |
b4c2b959 | 37 | #include <net/netfilter/nf_conntrack_bpf.h> |
605dcad6 | 38 | #include <net/netfilter/nf_conntrack_l4proto.h> |
77ab9cff | 39 | #include <net/netfilter/nf_conntrack_expect.h> |
9fb9cbb1 YK |
40 | #include <net/netfilter/nf_conntrack_helper.h> |
41 | #include <net/netfilter/nf_conntrack_core.h> | |
ecfab2c9 | 42 | #include <net/netfilter/nf_conntrack_extend.h> |
58401572 | 43 | #include <net/netfilter/nf_conntrack_acct.h> |
a0891aa6 | 44 | #include <net/netfilter/nf_conntrack_ecache.h> |
5d0aa2cc | 45 | #include <net/netfilter/nf_conntrack_zones.h> |
a992ca2a | 46 | #include <net/netfilter/nf_conntrack_timestamp.h> |
dd705072 | 47 | #include <net/netfilter/nf_conntrack_timeout.h> |
c539f017 | 48 | #include <net/netfilter/nf_conntrack_labels.h> |
48b1de4c | 49 | #include <net/netfilter/nf_conntrack_synproxy.h> |
e6a7d3c0 | 50 | #include <net/netfilter/nf_nat.h> |
49376368 | 51 | #include <net/netfilter/nf_nat_helper.h> |
1b8c8a9f | 52 | #include <net/netns/hash.h> |
6816d931 | 53 | #include <net/ip.h> |
9fb9cbb1 | 54 | |
e2a75007 FW |
55 | #include "nf_internals.h" |
56 | ||
93bb0ceb JDB |
57 | __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; |
58 | EXPORT_SYMBOL_GPL(nf_conntrack_locks); | |
9fb9cbb1 | 59 | |
ca7433df JDB |
60 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); |
61 | EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); | |
62 | ||
56d52d48 FW |
63 | struct hlist_nulls_head *nf_conntrack_hash __read_mostly; |
64 | EXPORT_SYMBOL_GPL(nf_conntrack_hash); | |
65 | ||
b87a2f91 FW |
66 | struct conntrack_gc_work { |
67 | struct delayed_work dwork; | |
4608fdfc | 68 | u32 next_bucket; |
2cfadb76 | 69 | u32 avg_timeout; |
95eabdd2 | 70 | u32 count; |
2cfadb76 | 71 | u32 start_time; |
b87a2f91 | 72 | bool exiting; |
c6dd940b | 73 | bool early_drop; |
b87a2f91 FW |
74 | }; |
75 | ||
0c5366b3 | 76 | static __read_mostly struct kmem_cache *nf_conntrack_cachep; |
44b63b0a | 77 | static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); |
b16c2919 SL |
78 | static __read_mostly bool nf_conntrack_locks_all; |
79 | ||
e9edc188 ED |
80 | /* serialize hash resizes and nf_ct_iterate_cleanup */ |
81 | static DEFINE_MUTEX(nf_conntrack_mutex); | |
82 | ||
2cfadb76 FW |
83 | #define GC_SCAN_INTERVAL_MAX (60ul * HZ) |
84 | #define GC_SCAN_INTERVAL_MIN (1ul * HZ) | |
85 | ||
86 | /* clamp timeouts to this value (TCP unacked) */ | |
87 | #define GC_SCAN_INTERVAL_CLAMP (300ul * HZ) | |
88 | ||
2aa19275 AT |
89 | /* Initial bias pretending we have 100 entries at the upper bound so we don't |
90 | * wake up often just because we have three entries with a 1s timeout, while still | |
91 | * allowing non-idle machines to wake up more often when needed. | |
2cfadb76 | 92 | */ |
2aa19275 AT |
93 | #define GC_SCAN_INITIAL_COUNT 100 |
94 | #define GC_SCAN_INTERVAL_INIT GC_SCAN_INTERVAL_MAX | |
2cfadb76 | 95 | |
4608fdfc | 96 | #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) |
2cfadb76 | 97 | #define GC_SCAN_EXPIRED_MAX (64000u / HZ) |
b87a2f91 | 98 | |
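/* Illustrative note (not part of the original file): the two limits above
 * bound the work done by one gc run.  GC_SCAN_MAX_DURATION caps a run at
 * roughly 10ms, and GC_SCAN_EXPIRED_MAX caps how many expired entries a
 * single run reaps before rescheduling, as a function of HZ, e.g.:
 *
 *	HZ = 1000:  64000 / 1000 =  64 evictions per run
 *	HZ =  250:  64000 /  250 = 256 evictions per run
 */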
c77737b7 ED |
99 | #define MIN_CHAINLEN 50u |
100 | #define MAX_CHAINLEN (80u - MIN_CHAINLEN) | |
d7e7747a | 101 | |
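/* Illustrative note (not part of the original file): the insertion paths below
 * compute a per-insert limit as
 *
 *	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
 *
 * i.e. a value in [50, 79].  The randomization avoids a single hard,
 * predictable cutoff while still rejecting pathologically long hash chains.
 */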
b87a2f91 FW |
102 | static struct conntrack_gc_work conntrack_gc_work; |
103 | ||
b16c2919 SL |
104 | void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) |
105 | { | |
3ef0c7a7 | 106 | /* 1) Acquire the lock */ |
b16c2919 | 107 | spin_lock(lock); |
b316ff78 | 108 | |
3ef0c7a7 MS |
109 | /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics |
110 | * It pairs with the smp_store_release() in nf_conntrack_all_unlock() | |
111 | */ | |
112 | if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false)) | |
113 | return; | |
114 | ||
115 | /* fast path failed, unlock */ | |
116 | spin_unlock(lock); | |
117 | ||
118 | /* Slow path 1) get global lock */ | |
119 | spin_lock(&nf_conntrack_locks_all_lock); | |
120 | ||
121 | /* Slow path 2) get the lock we want */ | |
122 | spin_lock(lock); | |
123 | ||
124 | /* Slow path 3) release the global lock */ | |
125 | spin_unlock(&nf_conntrack_locks_all_lock); | |
b16c2919 SL |
126 | } |
127 | EXPORT_SYMBOL_GPL(nf_conntrack_lock); | |
128 | ||
93bb0ceb JDB |
129 | static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) |
130 | { | |
131 | h1 %= CONNTRACK_LOCKS; | |
132 | h2 %= CONNTRACK_LOCKS; | |
133 | spin_unlock(&nf_conntrack_locks[h1]); | |
134 | if (h1 != h2) | |
135 | spin_unlock(&nf_conntrack_locks[h2]); | |
136 | } | |
137 | ||
138 | /* return true if we need to recompute hashes (in case hash table was resized) */ | |
139 | static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, | |
140 | unsigned int h2, unsigned int sequence) | |
141 | { | |
142 | h1 %= CONNTRACK_LOCKS; | |
143 | h2 %= CONNTRACK_LOCKS; | |
144 | if (h1 <= h2) { | |
b16c2919 | 145 | nf_conntrack_lock(&nf_conntrack_locks[h1]); |
93bb0ceb JDB |
146 | if (h1 != h2) |
147 | spin_lock_nested(&nf_conntrack_locks[h2], | |
148 | SINGLE_DEPTH_NESTING); | |
149 | } else { | |
b16c2919 | 150 | nf_conntrack_lock(&nf_conntrack_locks[h2]); |
93bb0ceb JDB |
151 | spin_lock_nested(&nf_conntrack_locks[h1], |
152 | SINGLE_DEPTH_NESTING); | |
153 | } | |
a3efd812 | 154 | if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { |
93bb0ceb JDB |
155 | nf_conntrack_double_unlock(h1, h2); |
156 | return true; | |
157 | } | |
158 | return false; | |
159 | } | |
160 | ||
161 | static void nf_conntrack_all_lock(void) | |
6b36d482 | 162 | __acquires(&nf_conntrack_locks_all_lock) |
93bb0ceb JDB |
163 | { |
164 | int i; | |
165 | ||
b16c2919 | 166 | spin_lock(&nf_conntrack_locks_all_lock); |
b16c2919 | 167 | |
cf4466ea MS |
168 | /* For nf_conntrack_locks_all, only the latest time when another |
169 | * CPU will see an update is controlled, by the "release" of the | |
170 | * spin_lock below. | |
171 | * The earliest time is not controlled, and thus KCSAN could detect | |
172 | * a race when nf_conntrack_lock() reads the variable. | |
173 | * WRITE_ONCE() is used to ensure the compiler will not | |
174 | * optimize the write. | |
175 | */ | |
176 | WRITE_ONCE(nf_conntrack_locks_all, true); | |
b316ff78 | 177 | |
b16c2919 | 178 | for (i = 0; i < CONNTRACK_LOCKS; i++) { |
3ef0c7a7 MS |
179 | spin_lock(&nf_conntrack_locks[i]); |
180 | ||
181 | /* This spin_unlock provides the "release" to ensure that | |
182 | * nf_conntrack_locks_all==true is visible to everyone that | |
183 | * acquired spin_lock(&nf_conntrack_locks[]). | |
184 | */ | |
185 | spin_unlock(&nf_conntrack_locks[i]); | |
b16c2919 | 186 | } |
93bb0ceb JDB |
187 | } |
188 | ||
189 | static void nf_conntrack_all_unlock(void) | |
6b36d482 | 190 | __releases(&nf_conntrack_locks_all_lock) |
93bb0ceb | 191 | { |
3ef0c7a7 | 192 | /* All prior stores must be complete before we clear |
b316ff78 PZ |
193 | * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock() |
194 | * might observe the false value but not the entire | |
3ef0c7a7 MS |
195 | * critical section. |
196 | * It pairs with the smp_load_acquire() in nf_conntrack_lock() | |
b316ff78 PZ |
197 | */ |
198 | smp_store_release(&nf_conntrack_locks_all, false); | |
b16c2919 | 199 | spin_unlock(&nf_conntrack_locks_all_lock); |
93bb0ceb JDB |
200 | } |
201 | ||
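/* Illustrative sketch (not part of the original file): how the two-level
 * locking above is typically used.  A writer touching one or two hash chains
 * takes only the per-bucket lock(s):
 *
 *	nf_conntrack_lock(&nf_conntrack_locks[hash % CONNTRACK_LOCKS]);
 *	...update the chain at nf_conntrack_hash[hash]...
 *	spin_unlock(&nf_conntrack_locks[hash % CONNTRACK_LOCKS]);
 *
 * A global operation (e.g. a table resize) brackets its work with
 * nf_conntrack_all_lock()/nf_conntrack_all_unlock(), which forces concurrent
 * nf_conntrack_lock() callers onto the slow path until it is done.
 */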
e2b7606c | 202 | unsigned int nf_conntrack_htable_size __read_mostly; |
2567c4ea PNA |
203 | EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); |
204 | ||
e478075c | 205 | unsigned int nf_conntrack_max __read_mostly; |
538c5672 | 206 | EXPORT_SYMBOL_GPL(nf_conntrack_max); |
8201d923 | 207 | seqcount_spinlock_t nf_conntrack_generation __read_mostly; |
49ecc2e9 | 208 | static siphash_aligned_key_t nf_conntrack_hash_rnd; |
9fb9cbb1 | 209 | |
1b8c8a9f | 210 | static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, |
b16ac3c4 | 211 | unsigned int zoneid, |
1b8c8a9f | 212 | const struct net *net) |
9fb9cbb1 | 213 | { |
d2c806ab | 214 | u64 a, b, c, d; |
0794935e | 215 | |
141658fb FW |
216 | get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); |
217 | ||
d2c806ab FW |
218 | /* The direction must be ignored, handle usable tuplehash members manually */ |
219 | a = (u64)tuple->src.u3.all[0] << 32 | tuple->src.u3.all[3]; | |
220 | b = (u64)tuple->dst.u3.all[0] << 32 | tuple->dst.u3.all[3]; | |
dd6d2910 | 221 | |
d2c806ab FW |
222 | c = (__force u64)tuple->src.u.all << 32 | (__force u64)tuple->dst.u.all << 16; |
223 | c |= tuple->dst.protonum; | |
dd6d2910 | 224 | |
d2c806ab FW |
225 | d = (u64)zoneid << 32 | net_hash_mix(net); |
226 | ||
227 | /* IPv4: u3.all[1,2,3] == 0 */ | |
228 | c ^= (u64)tuple->src.u3.all[1] << 32 | tuple->src.u3.all[2]; | |
229 | d += (u64)tuple->dst.u3.all[1] << 32 | tuple->dst.u3.all[2]; | |
230 | ||
231 | return (u32)siphash_4u64(a, b, c, d, &nf_conntrack_hash_rnd); | |
99f07e91 CG |
232 | } |
233 | ||
56d52d48 | 234 | static u32 scale_hash(u32 hash) |
99f07e91 | 235 | { |
56d52d48 | 236 | return reciprocal_scale(hash, nf_conntrack_htable_size); |
99f07e91 | 237 | } |
0794935e | 238 | |
1b8c8a9f FW |
239 | static u32 __hash_conntrack(const struct net *net, |
240 | const struct nf_conntrack_tuple *tuple, | |
b16ac3c4 | 241 | unsigned int zoneid, |
1b8c8a9f | 242 | unsigned int size) |
99f07e91 | 243 | { |
b16ac3c4 | 244 | return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size); |
9fb9cbb1 YK |
245 | } |
246 | ||
1b8c8a9f | 247 | static u32 hash_conntrack(const struct net *net, |
b16ac3c4 FW |
248 | const struct nf_conntrack_tuple *tuple, |
249 | unsigned int zoneid) | |
9fb9cbb1 | 250 | { |
b16ac3c4 | 251 | return scale_hash(hash_conntrack_raw(tuple, zoneid, net)); |
9fb9cbb1 YK |
252 | } |
253 | ||
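/* Illustrative note (not part of the original file): reciprocal_scale(x, size)
 * maps a 32-bit hash into [0, size) without a division, roughly
 * ((u64)x * size) >> 32.  A minimal sketch, assuming an 8192-bucket table:
 *
 *	u32 raw    = hash_conntrack_raw(tuple, zone_id, net);
 *	u32 bucket = reciprocal_scale(raw, 8192);	// 0 .. 8191
 */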
e2f7cc72 FW |
254 | static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, |
255 | unsigned int dataoff, | |
256 | struct nf_conntrack_tuple *tuple) | |
257 | { struct { | |
258 | __be16 sport; | |
259 | __be16 dport; | |
260 | } _inet_hdr, *inet_hdr; | |
261 | ||
262 | /* Actually only need first 4 bytes to get ports. */ | |
263 | inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr); | |
264 | if (!inet_hdr) | |
265 | return false; | |
266 | ||
267 | tuple->src.u.udp.port = inet_hdr->sport; | |
268 | tuple->dst.u.udp.port = inet_hdr->dport; | |
269 | return true; | |
270 | } | |
271 | ||
60e3be94 | 272 | static bool |
9fb9cbb1 YK |
273 | nf_ct_get_tuple(const struct sk_buff *skb, |
274 | unsigned int nhoff, | |
275 | unsigned int dataoff, | |
276 | u_int16_t l3num, | |
277 | u_int8_t protonum, | |
a31f1adc | 278 | struct net *net, |
303e0c55 | 279 | struct nf_conntrack_tuple *tuple) |
9fb9cbb1 | 280 | { |
47a91b14 FW |
281 | unsigned int size; |
282 | const __be32 *ap; | |
283 | __be32 _addrs[8]; | |
284 | ||
443a70d5 | 285 | memset(tuple, 0, sizeof(*tuple)); |
9fb9cbb1 YK |
286 | |
287 | tuple->src.l3num = l3num; | |
47a91b14 FW |
288 | switch (l3num) { |
289 | case NFPROTO_IPV4: | |
290 | nhoff += offsetof(struct iphdr, saddr); | |
291 | size = 2 * sizeof(__be32); | |
292 | break; | |
293 | case NFPROTO_IPV6: | |
294 | nhoff += offsetof(struct ipv6hdr, saddr); | |
295 | size = sizeof(_addrs); | |
296 | break; | |
297 | default: | |
298 | return true; | |
299 | } | |
300 | ||
301 | ap = skb_header_pointer(skb, nhoff, size, _addrs); | |
302 | if (!ap) | |
5f2b4c90 | 303 | return false; |
9fb9cbb1 | 304 | |
47a91b14 FW |
305 | switch (l3num) { |
306 | case NFPROTO_IPV4: | |
307 | tuple->src.u3.ip = ap[0]; | |
308 | tuple->dst.u3.ip = ap[1]; | |
309 | break; | |
310 | case NFPROTO_IPV6: | |
311 | memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); | |
312 | memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); | |
313 | break; | |
314 | } | |
315 | ||
9fb9cbb1 YK |
316 | tuple->dst.protonum = protonum; |
317 | tuple->dst.dir = IP_CT_DIR_ORIGINAL; | |
318 | ||
e2e48b47 | 319 | switch (protonum) { |
81e01647 | 320 | #if IS_ENABLED(CONFIG_IPV6) |
e2e48b47 FW |
321 | case IPPROTO_ICMPV6: |
322 | return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple); | |
81e01647 | 323 | #endif |
e2e48b47 FW |
324 | case IPPROTO_ICMP: |
325 | return icmp_pkt_to_tuple(skb, dataoff, net, tuple); | |
df5e1629 FW |
326 | #ifdef CONFIG_NF_CT_PROTO_GRE |
327 | case IPPROTO_GRE: | |
328 | return gre_pkt_to_tuple(skb, dataoff, net, tuple); | |
329 | #endif | |
e2f7cc72 | 330 | case IPPROTO_TCP: |
6be79156 | 331 | case IPPROTO_UDP: |
e2f7cc72 FW |
332 | #ifdef CONFIG_NF_CT_PROTO_UDPLITE |
333 | case IPPROTO_UDPLITE: | |
e2f7cc72 FW |
334 | #endif |
335 | #ifdef CONFIG_NF_CT_PROTO_SCTP | |
336 | case IPPROTO_SCTP: | |
e2f7cc72 FW |
337 | #endif |
338 | #ifdef CONFIG_NF_CT_PROTO_DCCP | |
339 | case IPPROTO_DCCP: | |
e2f7cc72 | 340 | #endif |
6be79156 JL |
341 | /* fallthrough */ |
342 | return nf_ct_get_tuple_ports(skb, dataoff, tuple); | |
e2f7cc72 FW |
343 | default: |
344 | break; | |
e2e48b47 | 345 | } |
97e08cae | 346 | |
97e08cae | 347 | return true; |
9fb9cbb1 YK |
348 | } |
349 | ||
6816d931 FW |
350 | static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, |
351 | u_int8_t *protonum) | |
352 | { | |
353 | int dataoff = -1; | |
6816d931 FW |
354 | const struct iphdr *iph; |
355 | struct iphdr _iph; | |
356 | ||
357 | iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); | |
358 | if (!iph) | |
359 | return -1; | |
360 | ||
361 | /* Conntrack defragments packets, we might still see fragments | |
362 | * inside ICMP packets though. | |
363 | */ | |
364 | if (iph->frag_off & htons(IP_OFFSET)) | |
365 | return -1; | |
366 | ||
367 | dataoff = nhoff + (iph->ihl << 2); | |
368 | *protonum = iph->protocol; | |
369 | ||
370 | /* Check bogus IP headers */ | |
371 | if (dataoff > skb->len) { | |
372 | pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n", | |
373 | nhoff, iph->ihl << 2, skb->len); | |
374 | return -1; | |
375 | } | |
6816d931 FW |
376 | return dataoff; |
377 | } | |
378 | ||
a0ae2562 | 379 | #if IS_ENABLED(CONFIG_IPV6) |
6816d931 FW |
380 | static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, |
381 | u8 *protonum) | |
382 | { | |
383 | int protoff = -1; | |
6816d931 FW |
384 | unsigned int extoff = nhoff + sizeof(struct ipv6hdr); |
385 | __be16 frag_off; | |
386 | u8 nexthdr; | |
387 | ||
388 | if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), | |
389 | &nexthdr, sizeof(nexthdr)) != 0) { | |
390 | pr_debug("can't get nexthdr\n"); | |
391 | return -1; | |
392 | } | |
393 | protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off); | |
394 | /* | |
395 | * (protoff == skb->len) means the packet has no data, just | |
396 | * IPv6 and possibly extension headers, but it is tracked anyway | |
397 | */ | |
398 | if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { | |
399 | pr_debug("can't find proto in pkt\n"); | |
400 | return -1; | |
401 | } | |
402 | ||
403 | *protonum = nexthdr; | |
6816d931 FW |
404 | return protoff; |
405 | } | |
a0ae2562 | 406 | #endif |
6816d931 FW |
407 | |
408 | static int get_l4proto(const struct sk_buff *skb, | |
409 | unsigned int nhoff, u8 pf, u8 *l4num) | |
410 | { | |
411 | switch (pf) { | |
412 | case NFPROTO_IPV4: | |
413 | return ipv4_get_l4proto(skb, nhoff, l4num); | |
a0ae2562 | 414 | #if IS_ENABLED(CONFIG_IPV6) |
6816d931 FW |
415 | case NFPROTO_IPV6: |
416 | return ipv6_get_l4proto(skb, nhoff, l4num); | |
a0ae2562 | 417 | #endif |
6816d931 FW |
418 | default: |
419 | *l4num = 0; | |
420 | break; | |
421 | } | |
422 | return -1; | |
9fb9cbb1 YK |
423 | } |
424 | ||
5f2b4c90 | 425 | bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, |
a31f1adc EB |
426 | u_int16_t l3num, |
427 | struct net *net, struct nf_conntrack_tuple *tuple) | |
e2a3123f | 428 | { |
6816d931 FW |
429 | u8 protonum; |
430 | int protoff; | |
e2a3123f | 431 | |
6816d931 | 432 | protoff = get_l4proto(skb, nhoff, l3num, &protonum); |
303e0c55 | 433 | if (protoff <= 0) |
5f2b4c90 | 434 | return false; |
e2a3123f | 435 | |
303e0c55 | 436 | return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple); |
e2a3123f YK |
437 | } |
438 | EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr); | |
439 | ||
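/* Illustrative sketch (not part of the original file): building the
 * original-direction tuple for an IPv4 skb; 'skb' and 'net' are assumed to
 * be supplied by the caller:
 *
 *	struct nf_conntrack_tuple tuple;
 *
 *	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
 *			       NFPROTO_IPV4, net, &tuple))
 *		return;		// truncated header or fragment, no tuple
 */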
5f2b4c90 | 440 | bool |
9fb9cbb1 | 441 | nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, |
303e0c55 | 442 | const struct nf_conntrack_tuple *orig) |
9fb9cbb1 | 443 | { |
443a70d5 | 444 | memset(inverse, 0, sizeof(*inverse)); |
9fb9cbb1 YK |
445 | |
446 | inverse->src.l3num = orig->src.l3num; | |
d1b6fe94 FW |
447 | |
448 | switch (orig->src.l3num) { | |
449 | case NFPROTO_IPV4: | |
450 | inverse->src.u3.ip = orig->dst.u3.ip; | |
451 | inverse->dst.u3.ip = orig->src.u3.ip; | |
452 | break; | |
453 | case NFPROTO_IPV6: | |
454 | inverse->src.u3.in6 = orig->dst.u3.in6; | |
455 | inverse->dst.u3.in6 = orig->src.u3.in6; | |
456 | break; | |
457 | default: | |
458 | break; | |
459 | } | |
9fb9cbb1 YK |
460 | |
461 | inverse->dst.dir = !orig->dst.dir; | |
462 | ||
463 | inverse->dst.protonum = orig->dst.protonum; | |
8b3892ea | 464 | |
197c4300 FW |
465 | switch (orig->dst.protonum) { |
466 | case IPPROTO_ICMP: | |
467 | return nf_conntrack_invert_icmp_tuple(inverse, orig); | |
81e01647 | 468 | #if IS_ENABLED(CONFIG_IPV6) |
197c4300 FW |
469 | case IPPROTO_ICMPV6: |
470 | return nf_conntrack_invert_icmpv6_tuple(inverse, orig); | |
81e01647 | 471 | #endif |
197c4300 | 472 | } |
8b3892ea FW |
473 | |
474 | inverse->src.u.all = orig->dst.u.all; | |
475 | inverse->dst.u.all = orig->src.u.all; | |
476 | return true; | |
9fb9cbb1 | 477 | } |
13b18339 | 478 | EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); |
9fb9cbb1 | 479 | |
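/* Illustrative note (not part of the original file): for a plain UDP flow the
 * inversion above just swaps the endpoints, e.g.
 *
 *	orig:    src 10.0.0.1:1234   dst 8.8.8.8:53       UDP
 *	inverse: src 8.8.8.8:53      dst 10.0.0.1:1234    UDP
 *
 * ICMP and ICMPv6 go through their protocol helpers instead because their
 * tuple "ports" hold type/code/id, which do not invert symmetrically.
 */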
3c791076 FW |
480 | /* Generate an almost-unique pseudo-id for a given conntrack. |
481 | * | |
482 | * intentionally doesn't re-use any of the seeds used for hash | |
483 | * table location; we assume the id gets exposed to userspace. | |
484 | * | |
485 | * Following nf_conn items do not change throughout lifetime | |
656c8e9c | 486 | * of the nf_conn: |
3c791076 FW |
487 | * |
488 | * 1. nf_conn address | |
656c8e9c DM |
489 | * 2. nf_conn->master address (normally NULL) |
490 | * 3. the associated net namespace | |
491 | * 4. the original direction tuple | |
3c791076 FW |
492 | */ |
493 | u32 nf_ct_get_id(const struct nf_conn *ct) | |
494 | { | |
49ecc2e9 | 495 | static siphash_aligned_key_t ct_id_seed; |
3c791076 FW |
496 | unsigned long a, b, c, d; |
497 | ||
498 | net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); | |
499 | ||
500 | a = (unsigned long)ct; | |
656c8e9c DM |
501 | b = (unsigned long)ct->master; |
502 | c = (unsigned long)nf_ct_net(ct); | |
503 | d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, | |
504 | sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple), | |
3c791076 FW |
505 | &ct_id_seed); |
506 | #ifdef CONFIG_64BIT | |
507 | return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed); | |
508 | #else | |
509 | return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed); | |
510 | #endif | |
511 | } | |
512 | EXPORT_SYMBOL_GPL(nf_ct_get_id); | |
513 | ||
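/* Illustrative note (not part of the original file): since all four inputs
 * listed above are fixed for the lifetime of the entry, the id can be
 * recomputed at any time to re-identify an entry previously reported to
 * userspace, e.g. (id_reported_earlier is a placeholder):
 *
 *	if (nf_ct_get_id(ct) == id_reported_earlier)
 *		...same conntrack entry...
 */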
9fb9cbb1 YK |
514 | static void |
515 | clean_from_lists(struct nf_conn *ct) | |
516 | { | |
ea781f19 ED |
517 | hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); |
518 | hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); | |
9fb9cbb1 YK |
519 | |
520 | /* Destroy all pending expectations */ | |
c1d10adb | 521 | nf_ct_remove_expectations(ct); |
9fb9cbb1 YK |
522 | } |
523 | ||
30322309 FW |
524 | #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) |
525 | ||
6ae7989c | 526 | /* Released via nf_ct_destroy() */ |
308ac914 DB |
527 | struct nf_conn *nf_ct_tmpl_alloc(struct net *net, |
528 | const struct nf_conntrack_zone *zone, | |
529 | gfp_t flags) | |
0838aa7f | 530 | { |
30322309 | 531 | struct nf_conn *tmpl, *p; |
0838aa7f | 532 | |
30322309 FW |
533 | if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) { |
534 | tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags); | |
535 | if (!tmpl) | |
536 | return NULL; | |
537 | ||
538 | p = tmpl; | |
539 | tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); | |
540 | if (tmpl != p) { | |
541 | tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); | |
542 | tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; | |
543 | } | |
544 | } else { | |
545 | tmpl = kzalloc(sizeof(*tmpl), flags); | |
546 | if (!tmpl) | |
547 | return NULL; | |
548 | } | |
0838aa7f PNA |
549 | |
550 | tmpl->status = IPS_TEMPLATE; | |
551 | write_pnet(&tmpl->ct_net, net); | |
6c8dee98 | 552 | nf_ct_zone_add(tmpl, zone); |
71977437 | 553 | refcount_set(&tmpl->ct_general.use, 1); |
0838aa7f PNA |
554 | |
555 | return tmpl; | |
0838aa7f PNA |
556 | } |
557 | EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); | |
558 | ||
9cf94eab | 559 | void nf_ct_tmpl_free(struct nf_conn *tmpl) |
0838aa7f | 560 | { |
1bc91a5d | 561 | kfree(tmpl->ext); |
30322309 FW |
562 | |
563 | if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) | |
564 | kfree((char *)tmpl - tmpl->proto.tmpl_padto); | |
565 | else | |
566 | kfree(tmpl); | |
0838aa7f | 567 | } |
9cf94eab | 568 | EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); |
0838aa7f | 569 | |
e5689435 FW |
570 | static void destroy_gre_conntrack(struct nf_conn *ct) |
571 | { | |
81e01647 | 572 | #ifdef CONFIG_NF_CT_PROTO_GRE |
e5689435 FW |
573 | struct nf_conn *master = ct->master; |
574 | ||
575 | if (master) | |
576 | nf_ct_gre_keymap_destroy(master); | |
81e01647 | 577 | #endif |
e5689435 FW |
578 | } |
579 | ||
6ae7989c | 580 | void nf_ct_destroy(struct nf_conntrack *nfct) |
9fb9cbb1 YK |
581 | { |
582 | struct nf_conn *ct = (struct nf_conn *)nfct; | |
9fb9cbb1 | 583 | |
71977437 | 584 | WARN_ON(refcount_read(&nfct->use) != 0); |
9fb9cbb1 | 585 | |
0838aa7f PNA |
586 | if (unlikely(nf_ct_is_template(ct))) { |
587 | nf_ct_tmpl_free(ct); | |
588 | return; | |
589 | } | |
e5689435 FW |
590 | |
591 | if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE)) | |
592 | destroy_gre_conntrack(ct); | |
9fb9cbb1 | 593 | |
9fb9cbb1 YK |
594 | /* Expectations will have been removed in clean_from_lists, |
595 | * except TFTP can create an expectation on the first packet, | |
596 | * before connection is in the list, so we need to clean here, | |
ca7433df JDB |
597 | * too. |
598 | */ | |
c1d10adb | 599 | nf_ct_remove_expectations(ct); |
9fb9cbb1 | 600 | |
9fb9cbb1 YK |
601 | if (ct->master) |
602 | nf_ct_put(ct->master); | |
603 | ||
9fb9cbb1 YK |
604 | nf_conntrack_free(ct); |
605 | } | |
6ae7989c | 606 | EXPORT_SYMBOL(nf_ct_destroy); |
9fb9cbb1 | 607 | |
2ed3bf18 | 608 | static void __nf_ct_delete_from_lists(struct nf_conn *ct) |
9fb9cbb1 | 609 | { |
0d55af87 | 610 | struct net *net = nf_ct_net(ct); |
93bb0ceb | 611 | unsigned int hash, reply_hash; |
93bb0ceb | 612 | unsigned int sequence; |
9fb9cbb1 | 613 | |
93bb0ceb | 614 | do { |
a3efd812 | 615 | sequence = read_seqcount_begin(&nf_conntrack_generation); |
deedb590 | 616 | hash = hash_conntrack(net, |
b16ac3c4 FW |
617 | &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, |
618 | nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); | |
deedb590 | 619 | reply_hash = hash_conntrack(net, |
b16ac3c4 FW |
620 | &ct->tuplehash[IP_CT_DIR_REPLY].tuple, |
621 | nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); | |
93bb0ceb JDB |
622 | } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); |
623 | ||
9fb9cbb1 | 624 | clean_from_lists(ct); |
93bb0ceb | 625 | nf_conntrack_double_unlock(hash, reply_hash); |
2ed3bf18 FW |
626 | } |
627 | ||
628 | static void nf_ct_delete_from_lists(struct nf_conn *ct) | |
629 | { | |
630 | nf_ct_helper_destroy(ct); | |
631 | local_bh_disable(); | |
93bb0ceb | 632 | |
2ed3bf18 | 633 | __nf_ct_delete_from_lists(ct); |
93bb0ceb | 634 | |
93bb0ceb | 635 | local_bh_enable(); |
dd7669a9 | 636 | } |
dd7669a9 | 637 | |
2ed3bf18 FW |
638 | static void nf_ct_add_to_ecache_list(struct nf_conn *ct) |
639 | { | |
640 | #ifdef CONFIG_NF_CONNTRACK_EVENTS | |
641 | struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct)); | |
642 | ||
643 | spin_lock(&cnet->ecache.dying_lock); | |
644 | hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, | |
645 | &cnet->ecache.dying_list); | |
646 | spin_unlock(&cnet->ecache.dying_lock); | |
2ed3bf18 FW |
647 | #endif |
648 | } | |
649 | ||
02982c27 | 650 | bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) |
dd7669a9 | 651 | { |
a992ca2a | 652 | struct nf_conn_tstamp *tstamp; |
1379940b | 653 | struct net *net; |
a992ca2a | 654 | |
f330a7fd FW |
655 | if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) |
656 | return false; | |
657 | ||
a992ca2a | 658 | tstamp = nf_conn_tstamp_find(ct); |
30a56a2b | 659 | if (tstamp) { |
802a7dc5 | 660 | s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; |
30a56a2b | 661 | |
d2de875c | 662 | tstamp->stop = ktime_get_real_ns(); |
30a56a2b FW |
663 | if (timeout < 0) |
664 | tstamp->stop -= jiffies_to_nsecs(-timeout); | |
665 | } | |
dd7669a9 | 666 | |
9500507c FW |
667 | if (nf_conntrack_event_report(IPCT_DESTROY, ct, |
668 | portid, report) < 0) { | |
f330a7fd FW |
669 | /* destroy event was not delivered. nf_ct_put will |
670 | * be done by event cache worker on redelivery. | |
671 | */ | |
2ed3bf18 FW |
672 | nf_ct_helper_destroy(ct); |
673 | local_bh_disable(); | |
674 | __nf_ct_delete_from_lists(ct); | |
675 | nf_ct_add_to_ecache_list(ct); | |
676 | local_bh_enable(); | |
677 | ||
1379940b | 678 | nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL); |
02982c27 | 679 | return false; |
dd7669a9 | 680 | } |
9500507c | 681 | |
1379940b FW |
682 | net = nf_ct_net(ct); |
683 | if (nf_conntrack_ecache_dwork_pending(net)) | |
684 | nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT); | |
dd7669a9 | 685 | nf_ct_delete_from_lists(ct); |
9fb9cbb1 | 686 | nf_ct_put(ct); |
02982c27 FW |
687 | return true; |
688 | } | |
689 | EXPORT_SYMBOL_GPL(nf_ct_delete); | |
690 | ||
c6825c09 AV |
691 | static inline bool |
692 | nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, | |
308ac914 | 693 | const struct nf_conntrack_tuple *tuple, |
e0c7d472 FW |
694 | const struct nf_conntrack_zone *zone, |
695 | const struct net *net) | |
c6825c09 AV |
696 | { |
697 | struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); | |
698 | ||
699 | /* A conntrack can be recreated with an equal tuple, |
700 | * so we need to check that the conntrack is confirmed | |
701 | */ | |
702 | return nf_ct_tuple_equal(tuple, &h->tuple) && | |
deedb590 | 703 | nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && |
e0c7d472 FW |
704 | nf_ct_is_confirmed(ct) && |
705 | net_eq(net, nf_ct_net(ct)); | |
c6825c09 AV |
706 | } |
707 | ||
ed07d9a0 MP |
708 | static inline bool |
709 | nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) | |
710 | { | |
711 | return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, | |
712 | &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && | |
713 | nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, | |
714 | &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) && | |
715 | nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) && | |
716 | nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) && | |
717 | net_eq(nf_ct_net(ct1), nf_ct_net(ct2)); | |
718 | } | |
719 | ||
f330a7fd FW |
720 | /* caller must hold rcu readlock and none of the nf_conntrack_locks */ |
721 | static void nf_ct_gc_expired(struct nf_conn *ct) | |
722 | { | |
71977437 | 723 | if (!refcount_inc_not_zero(&ct->ct_general.use)) |
f330a7fd FW |
724 | return; |
725 | ||
0ed8f619 FW |
726 | /* load ->status after refcount increase */ |
727 | smp_acquire__after_ctrl_dep(); | |
728 | ||
f330a7fd FW |
729 | if (nf_ct_should_gc(ct)) |
730 | nf_ct_kill(ct); | |
731 | ||
732 | nf_ct_put(ct); | |
733 | } | |
734 | ||
ea781f19 ED |
735 | /* |
736 | * Warning : | |
737 | * - Caller must take a reference on returned object | |
738 | * and recheck nf_ct_tuple_equal(tuple, &h->tuple) | |
ea781f19 | 739 | */ |
99f07e91 | 740 | static struct nf_conntrack_tuple_hash * |
308ac914 | 741 | ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, |
99f07e91 | 742 | const struct nf_conntrack_tuple *tuple, u32 hash) |
9fb9cbb1 YK |
743 | { |
744 | struct nf_conntrack_tuple_hash *h; | |
5e3c61f9 | 745 | struct hlist_nulls_head *ct_hash; |
ea781f19 | 746 | struct hlist_nulls_node *n; |
92e47ba8 | 747 | unsigned int bucket, hsize; |
9fb9cbb1 | 748 | |
ea781f19 | 749 | begin: |
92e47ba8 LZ |
750 | nf_conntrack_get_ht(&ct_hash, &hsize); |
751 | bucket = reciprocal_scale(hash, hsize); | |
5e3c61f9 FW |
752 | |
753 | hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { | |
f330a7fd FW |
754 | struct nf_conn *ct; |
755 | ||
756 | ct = nf_ct_tuplehash_to_ctrack(h); | |
757 | if (nf_ct_is_expired(ct)) { | |
758 | nf_ct_gc_expired(ct); | |
759 | continue; | |
760 | } | |
761 | ||
8e8118f8 | 762 | if (nf_ct_key_equal(h, tuple, zone, net)) |
9fb9cbb1 | 763 | return h; |
9fb9cbb1 | 764 | } |
ea781f19 ED |
765 | /* |
766 | * if the nulls value we got at the end of this lookup is | |
767 | * not the expected one, we must restart lookup. | |
768 | * We probably met an item that was moved to another chain. | |
769 | */ | |
99f07e91 | 770 | if (get_nulls_value(n) != bucket) { |
2cf12348 | 771 | NF_CT_STAT_INC_ATOMIC(net, search_restart); |
ea781f19 | 772 | goto begin; |
af740b2c | 773 | } |
9fb9cbb1 YK |
774 | |
775 | return NULL; | |
776 | } | |
99f07e91 | 777 | |
9fb9cbb1 | 778 | /* Find a connection corresponding to a tuple. */ |
99f07e91 | 779 | static struct nf_conntrack_tuple_hash * |
308ac914 | 780 | __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, |
99f07e91 | 781 | const struct nf_conntrack_tuple *tuple, u32 hash) |
9fb9cbb1 YK |
782 | { |
783 | struct nf_conntrack_tuple_hash *h; | |
76507f69 | 784 | struct nf_conn *ct; |
9fb9cbb1 | 785 | |
99f07e91 | 786 | h = ____nf_conntrack_find(net, zone, tuple, hash); |
76507f69 | 787 | if (h) { |
87e389b4 FW |
788 | /* We have a candidate that matches the tuple we're interested |
789 | * in, try to obtain a reference and re-check tuple | |
790 | */ | |
76507f69 | 791 | ct = nf_ct_tuplehash_to_ctrack(h); |
71977437 | 792 | if (likely(refcount_inc_not_zero(&ct->ct_general.use))) { |
0ed8f619 FW |
793 | /* re-check key after refcount */ |
794 | smp_acquire__after_ctrl_dep(); | |
795 | ||
87e389b4 | 796 | if (likely(nf_ct_key_equal(h, tuple, zone, net))) |
2a2fa2ef | 797 | return h; |
87e389b4 FW |
798 | |
799 | /* TYPESAFE_BY_RCU recycled the candidate */ | |
800 | nf_ct_put(ct); | |
ea781f19 | 801 | } |
87e389b4 FW |
802 | |
803 | h = NULL; | |
76507f69 | 804 | } |
9fb9cbb1 YK |
805 | |
806 | return h; | |
807 | } | |
99f07e91 CG |
808 | |
809 | struct nf_conntrack_tuple_hash * | |
308ac914 | 810 | nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, |
99f07e91 CG |
811 | const struct nf_conntrack_tuple *tuple) |
812 | { | |
b16ac3c4 FW |
813 | unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); |
814 | struct nf_conntrack_tuple_hash *thash; | |
815 | ||
2a2fa2ef FW |
816 | rcu_read_lock(); |
817 | ||
b16ac3c4 FW |
818 | thash = __nf_conntrack_find_get(net, zone, tuple, |
819 | hash_conntrack_raw(tuple, zone_id, net)); | |
820 | ||
821 | if (thash) | |
2a2fa2ef | 822 | goto out_unlock; |
b16ac3c4 FW |
823 | |
824 | rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); | |
825 | if (rid != zone_id) | |
2a2fa2ef FW |
826 | thash = __nf_conntrack_find_get(net, zone, tuple, |
827 | hash_conntrack_raw(tuple, rid, net)); | |
828 | ||
829 | out_unlock: | |
830 | rcu_read_unlock(); | |
b16ac3c4 | 831 | return thash; |
99f07e91 | 832 | } |
13b18339 | 833 | EXPORT_SYMBOL_GPL(nf_conntrack_find_get); |
9fb9cbb1 | 834 | |
c1d10adb PNA |
835 | static void __nf_conntrack_hash_insert(struct nf_conn *ct, |
836 | unsigned int hash, | |
b476b72a | 837 | unsigned int reply_hash) |
c1d10adb | 838 | { |
ea781f19 | 839 | hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, |
56d52d48 | 840 | &nf_conntrack_hash[hash]); |
ea781f19 | 841 | hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, |
56d52d48 | 842 | &nf_conntrack_hash[reply_hash]); |
c1d10adb PNA |
843 | } |
844 | ||
c56716c6 FW |
845 | static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext) |
846 | { | |
847 | /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions | |
848 | * may contain stale pointers to e.g. helper that has been removed. | |
849 | * | |
850 | * The helper can't clear this because the nf_conn object isn't in | |
851 | * any hash and synchronize_rcu() isn't enough because associated skb | |
852 | * might sit in a queue. | |
853 | */ | |
854 | return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid); | |
855 | } | |
856 | ||
857 | static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext) | |
858 | { | |
859 | if (!ext) | |
860 | return true; | |
861 | ||
862 | if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid)) | |
863 | return false; | |
864 | ||
865 | /* inserted into conntrack table, nf_ct_iterate_cleanup() | |
866 | * will find it. Disable nf_ct_ext_find() id check. | |
867 | */ | |
868 | WRITE_ONCE(ext->gen_id, 0); | |
869 | return true; | |
870 | } | |
871 | ||
7d367e06 JK |
872 | int |
873 | nf_conntrack_hash_check_insert(struct nf_conn *ct) | |
c1d10adb | 874 | { |
308ac914 | 875 | const struct nf_conntrack_zone *zone; |
d696c7bd | 876 | struct net *net = nf_ct_net(ct); |
b476b72a | 877 | unsigned int hash, reply_hash; |
7d367e06 JK |
878 | struct nf_conntrack_tuple_hash *h; |
879 | struct hlist_nulls_node *n; | |
c9c3b681 | 880 | unsigned int max_chainlen; |
d7e7747a | 881 | unsigned int chainlen = 0; |
93bb0ceb | 882 | unsigned int sequence; |
d7e7747a | 883 | int err = -EEXIST; |
c1d10adb | 884 | |
5d0aa2cc | 885 | zone = nf_ct_zone(ct); |
7d367e06 | 886 | |
e6d57e9f FW |
887 | if (!nf_ct_ext_valid_pre(ct->ext)) |
888 | return -EAGAIN; | |
c56716c6 | 889 | |
93bb0ceb JDB |
890 | local_bh_disable(); |
891 | do { | |
a3efd812 | 892 | sequence = read_seqcount_begin(&nf_conntrack_generation); |
deedb590 | 893 | hash = hash_conntrack(net, |
b16ac3c4 FW |
894 | &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, |
895 | nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); | |
deedb590 | 896 | reply_hash = hash_conntrack(net, |
b16ac3c4 FW |
897 | &ct->tuplehash[IP_CT_DIR_REPLY].tuple, |
898 | nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); | |
93bb0ceb | 899 | } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); |
7d367e06 | 900 | |
8032bf12 | 901 | max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); |
c9c3b681 | 902 | |
7d367e06 | 903 | /* See if there's one in the list already, including reverse */ |
d7e7747a | 904 | hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { |
86804348 | 905 | if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, |
e0c7d472 | 906 | zone, net)) |
7d367e06 | 907 | goto out; |
86804348 | 908 | |
c9c3b681 | 909 | if (chainlen++ > max_chainlen) |
d7e7747a FW |
910 | goto chaintoolong; |
911 | } | |
912 | ||
913 | chainlen = 0; | |
914 | ||
915 | hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { | |
86804348 | 916 | if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, |
e0c7d472 | 917 | zone, net)) |
7d367e06 | 918 | goto out; |
c9c3b681 | 919 | if (chainlen++ > max_chainlen) |
d7e7747a FW |
920 | goto chaintoolong; |
921 | } | |
c1d10adb | 922 | |
e6d57e9f FW |
923 | /* If genid has changed, we can't insert anymore because ct |
924 | * extensions could have stale pointers and nf_ct_iterate_destroy | |
925 | * might have completed its table scan already. | |
926 | * | |
927 | * Increment of the ext genid right after this check is fine: | |
928 | * nf_ct_iterate_destroy blocks until locks are released. | |
929 | */ | |
930 | if (!nf_ct_ext_valid_post(ct->ext)) { | |
931 | err = -EAGAIN; | |
932 | goto out; | |
933 | } | |
934 | ||
e53376be PNA |
935 | smp_wmb(); |
936 | /* The caller holds a reference to this object */ | |
71977437 | 937 | refcount_set(&ct->ct_general.use, 2); |
b476b72a | 938 | __nf_conntrack_hash_insert(ct, hash, reply_hash); |
93bb0ceb | 939 | nf_conntrack_double_unlock(hash, reply_hash); |
7d367e06 | 940 | NF_CT_STAT_INC(net, insert); |
93bb0ceb | 941 | local_bh_enable(); |
c56716c6 | 942 | |
7d367e06 | 943 | return 0; |
d7e7747a FW |
944 | chaintoolong: |
945 | NF_CT_STAT_INC(net, chaintoolong); | |
946 | err = -ENOSPC; | |
7d367e06 | 947 | out: |
93bb0ceb | 948 | nf_conntrack_double_unlock(hash, reply_hash); |
93bb0ceb | 949 | local_bh_enable(); |
d7e7747a | 950 | return err; |
c1d10adb | 951 | } |
7d367e06 | 952 | EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); |
c1d10adb | 953 | |
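/* Illustrative sketch (not part of the original file): how a caller might act
 * on the return codes above (-EEXIST: identical tuple already present,
 * -ENOSPC: hash chain too long, -EAGAIN: extension generation changed):
 *
 *	int err = nf_conntrack_hash_check_insert(ct);
 *
 *	if (err == -EAGAIN) {
 *		...rebuild the entry's extensions and retry...
 *	} else if (err < 0) {
 *		...duplicate or overlong chain: drop the new entry...
 *	}
 */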
9312eaba | 954 | void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets, |
955 | unsigned int bytes) | |
ba76738c PNA |
956 | { |
957 | struct nf_conn_acct *acct; | |
958 | ||
959 | acct = nf_conn_acct_find(ct); | |
960 | if (acct) { | |
961 | struct nf_conn_counter *counter = acct->counter; | |
962 | ||
9312eaba | 963 | atomic64_add(packets, &counter[dir].packets); |
8ac2bd35 | 964 | atomic64_add(bytes, &counter[dir].bytes); |
ba76738c PNA |
965 | } |
966 | } | |
9312eaba | 967 | EXPORT_SYMBOL_GPL(nf_ct_acct_add); |
ba76738c | 968 | |
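/* Illustrative sketch (not part of the original file): charging one packet of
 * skb->len bytes to the direction the packet travelled in, assuming ctinfo
 * came from nf_ct_get():
 *
 *	nf_ct_acct_add(ct, CTINFO2DIR(ctinfo), 1, skb->len);
 */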
71d8c47f PNA |
969 | static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, |
970 | const struct nf_conn *loser_ct) | |
971 | { | |
972 | struct nf_conn_acct *acct; | |
973 | ||
974 | acct = nf_conn_acct_find(loser_ct); | |
975 | if (acct) { | |
976 | struct nf_conn_counter *counter = acct->counter; | |
71d8c47f PNA |
977 | unsigned int bytes; |
978 | ||
979 | /* u32 should be fine since we must have seen one packet. */ | |
980 | bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); | |
8ac2bd35 | 981 | nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); |
71d8c47f PNA |
982 | } |
983 | } | |
984 | ||
b1b32552 FW |
985 | static void __nf_conntrack_insert_prepare(struct nf_conn *ct) |
986 | { | |
987 | struct nf_conn_tstamp *tstamp; | |
988 | ||
71977437 | 989 | refcount_inc(&ct->ct_general.use); |
b1b32552 FW |
990 | |
991 | /* set conntrack timestamp, if enabled. */ | |
992 | tstamp = nf_conn_tstamp_find(ct); | |
993 | if (tstamp) | |
994 | tstamp->start = ktime_get_real_ns(); | |
995 | } | |
996 | ||
ff73e747 | 997 | /* caller must hold locks to prevent concurrent changes */ |
bb89abe5 FW |
998 | static int __nf_ct_resolve_clash(struct sk_buff *skb, |
999 | struct nf_conntrack_tuple_hash *h) | |
1000 | { | |
1001 | /* This is the conntrack entry already in hashes that won race. */ | |
1002 | struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); | |
1003 | enum ip_conntrack_info ctinfo; | |
1004 | struct nf_conn *loser_ct; | |
1005 | ||
1006 | loser_ct = nf_ct_get(skb, &ctinfo); | |
1007 | ||
1008 | if (nf_ct_is_dying(ct)) | |
1009 | return NF_DROP; | |
1010 | ||
bb89abe5 FW |
1011 | if (((ct->status & IPS_NAT_DONE_MASK) == 0) || |
1012 | nf_ct_match(ct, loser_ct)) { | |
1013 | struct net *net = nf_ct_net(ct); | |
1014 | ||
ff73e747 FW |
1015 | nf_conntrack_get(&ct->ct_general); |
1016 | ||
bb89abe5 | 1017 | nf_ct_acct_merge(ct, ctinfo, loser_ct); |
408bdcfc | 1018 | nf_ct_put(loser_ct); |
bb89abe5 FW |
1019 | nf_ct_set(skb, ct, ctinfo); |
1020 | ||
bc924704 | 1021 | NF_CT_STAT_INC(net, clash_resolve); |
bb89abe5 FW |
1022 | return NF_ACCEPT; |
1023 | } | |
1024 | ||
bb89abe5 FW |
1025 | return NF_DROP; |
1026 | } | |
1027 | ||
6a757c07 FW |
1028 | /** |
1029 | * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry | |
1030 | * | |
1031 | * @skb: skb that causes the collision | |
1032 | * @repl_idx: hash slot for reply direction | |
1033 | * | |
1034 | * Called when origin or reply direction had a clash. | |
1035 | * The skb can be handled without packet drop provided the reply direction | |
1036 | * is unique or the existing entry has the identical tuple in both | |
1037 | * directions. | |
1038 | * | |
1039 | * Caller must hold conntrack table locks to prevent concurrent updates. | |
1040 | * | |
1041 | * Returns NF_DROP if the clash could not be handled. | |
1042 | */ | |
1043 | static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) | |
1044 | { | |
1045 | struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb); | |
1046 | const struct nf_conntrack_zone *zone; | |
1047 | struct nf_conntrack_tuple_hash *h; | |
1048 | struct hlist_nulls_node *n; | |
1049 | struct net *net; | |
1050 | ||
1051 | zone = nf_ct_zone(loser_ct); | |
1052 | net = nf_ct_net(loser_ct); | |
1053 | ||
1054 | /* Reply direction must never result in a clash, unless both origin | |
1055 | * and reply tuples are identical. | |
1056 | */ | |
1057 | hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) { | |
1058 | if (nf_ct_key_equal(h, | |
1059 | &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple, | |
1060 | zone, net)) | |
1061 | return __nf_ct_resolve_clash(skb, h); | |
1062 | } | |
1063 | ||
1064 | /* We want the clashing entry to go away real soon: 1 second timeout. */ | |
802a7dc5 | 1065 | WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ); |
6a757c07 FW |
1066 | |
1067 | /* IPS_NAT_CLASH removes the entry automatically on the first | |
1068 | * reply. Also prevents UDP tracker from moving the entry to | |
1069 | * ASSURED state, i.e. the entry can always be evicted under | |
1070 | * pressure. | |
1071 | */ | |
1072 | loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH; | |
1073 | ||
1074 | __nf_conntrack_insert_prepare(loser_ct); | |
1075 | ||
1076 | /* fake add for ORIGINAL dir: we want lookups to only find the entry | |
1077 | * already in the table. This also hides the clashing entry from | |
1078 | * ctnetlink iteration, i.e. conntrack -L won't show them. | |
1079 | */ | |
1080 | hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); | |
1081 | ||
1082 | hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, | |
1083 | &nf_conntrack_hash[repl_idx]); | |
bc924704 FW |
1084 | |
1085 | NF_CT_STAT_INC(net, clash_resolve); | |
6a757c07 FW |
1086 | return NF_ACCEPT; |
1087 | } | |
1088 | ||
3d1e0b40 FW |
1089 | /** |
1090 | * nf_ct_resolve_clash - attempt to handle clash without packet drop | |
1091 | * | |
1092 | * @skb: skb that causes the clash | |
1093 | * @h: tuplehash of the clashing entry already in table | |
3db86c39 | 1094 | * @reply_hash: hash slot for reply direction |
3d1e0b40 FW |
1095 | * |
1096 | * A conntrack entry can be inserted to the connection tracking table | |
1097 | * if there is no existing entry with an identical tuple. | |
1098 | * | |
1099 | * If there is one, @skb (and the associated, unconfirmed conntrack) has |
1100 | * to be dropped. In case @skb is retransmitted, next conntrack lookup | |
1101 | * will find the already-existing entry. | |
1102 | * | |
1103 | * The major problem with such packet drop is the extra delay added by | |
1104 | * the packet loss -- it will take some time for a retransmit to occur | |
1105 | * (or the sender to time out when waiting for a reply). | |
1106 | * | |
1107 | * This function attempts to handle the situation without packet drop. | |
1108 | * | |
1109 | * If @skb has no NAT transformation or if the colliding entries are | |
1110 | * exactly the same, only the to-be-confirmed conntrack entry is discarded | |
1111 | * and @skb is associated with the conntrack entry already in the table. | |
1112 | * | |
6a757c07 FW |
1113 | * Failing that, the new, unconfirmed conntrack is still added to the table |
1114 | * provided that the collision only occurs in the ORIGINAL direction. | |
bc924704 | 1115 | * The new entry will be added only in the non-clashing REPLY direction, |
6a757c07 FW |
1116 | * so packets in the ORIGINAL direction will continue to match the existing |
1117 | * entry. The new entry will also have a fixed timeout so it expires -- | |
bc924704 | 1118 | * due to the collision, it will only see reply traffic. |
6a757c07 | 1119 | * |
3d1e0b40 FW |
1120 | * Returns NF_DROP if the clash could not be resolved. |
1121 | */ | |
c7c17e6a | 1122 | static __cold noinline int |
6a757c07 FW |
1123 | nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h, |
1124 | u32 reply_hash) | |
71d8c47f PNA |
1125 | { |
1126 | /* This is the conntrack entry already in hashes that won race. */ | |
1127 | struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); | |
b3480fe0 | 1128 | const struct nf_conntrack_l4proto *l4proto; |
3d1e0b40 FW |
1129 | enum ip_conntrack_info ctinfo; |
1130 | struct nf_conn *loser_ct; | |
1131 | struct net *net; | |
bb89abe5 | 1132 | int ret; |
3d1e0b40 FW |
1133 | |
1134 | loser_ct = nf_ct_get(skb, &ctinfo); | |
bb89abe5 | 1135 | net = nf_ct_net(loser_ct); |
71d8c47f | 1136 | |
4a60dc74 | 1137 | l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); |
3d1e0b40 FW |
1138 | if (!l4proto->allow_clash) |
1139 | goto drop; | |
1140 | ||
bb89abe5 FW |
1141 | ret = __nf_ct_resolve_clash(skb, h); |
1142 | if (ret == NF_ACCEPT) | |
1143 | return ret; | |
3d1e0b40 | 1144 | |
6a757c07 FW |
1145 | ret = nf_ct_resolve_clash_harder(skb, reply_hash); |
1146 | if (ret == NF_ACCEPT) | |
1147 | return ret; | |
1148 | ||
3d1e0b40 | 1149 | drop: |
71d8c47f | 1150 | NF_CT_STAT_INC(net, drop); |
bb89abe5 | 1151 | NF_CT_STAT_INC(net, insert_failed); |
71d8c47f PNA |
1152 | return NF_DROP; |
1153 | } | |
1154 | ||
9fb9cbb1 YK |
1155 | /* Confirm a connection given skb; places it in hash table */ |
1156 | int | |
3db05fea | 1157 | __nf_conntrack_confirm(struct sk_buff *skb) |
9fb9cbb1 | 1158 | { |
c9c3b681 | 1159 | unsigned int chainlen = 0, sequence, max_chainlen; |
308ac914 | 1160 | const struct nf_conntrack_zone *zone; |
b476b72a | 1161 | unsigned int hash, reply_hash; |
df0933dc | 1162 | struct nf_conntrack_tuple_hash *h; |
9fb9cbb1 | 1163 | struct nf_conn *ct; |
df0933dc | 1164 | struct nf_conn_help *help; |
ea781f19 | 1165 | struct hlist_nulls_node *n; |
9fb9cbb1 | 1166 | enum ip_conntrack_info ctinfo; |
400dad39 | 1167 | struct net *net; |
71d8c47f | 1168 | int ret = NF_DROP; |
9fb9cbb1 | 1169 | |
3db05fea | 1170 | ct = nf_ct_get(skb, &ctinfo); |
400dad39 | 1171 | net = nf_ct_net(ct); |
9fb9cbb1 YK |
1172 | |
1173 | /* ipt_REJECT uses nf_conntrack_attach to attach related | |
1174 | ICMP/TCP RST packets in other direction. Actual packet | |
1175 | which created connection will be IP_CT_NEW or for an | |
1176 | expected connection, IP_CT_RELATED. */ | |
1177 | if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) | |
1178 | return NF_ACCEPT; | |
1179 | ||
5d0aa2cc | 1180 | zone = nf_ct_zone(ct); |
93bb0ceb JDB |
1181 | local_bh_disable(); |
1182 | ||
1183 | do { | |
a3efd812 | 1184 | sequence = read_seqcount_begin(&nf_conntrack_generation); |
93bb0ceb JDB |
1185 | /* reuse the hash saved before */ |
1186 | hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; | |
56d52d48 | 1187 | hash = scale_hash(hash); |
deedb590 | 1188 | reply_hash = hash_conntrack(net, |
b16ac3c4 FW |
1189 | &ct->tuplehash[IP_CT_DIR_REPLY].tuple, |
1190 | nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); | |
93bb0ceb | 1191 | } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); |
9fb9cbb1 YK |
1192 | |
1193 | /* We're not in hash table, and we refuse to set up related | |
93bb0ceb JDB |
1194 | * connections for unconfirmed conns. But packet copies and |
1195 | * REJECT will give spurious warnings here. | |
1196 | */ | |
9fb9cbb1 | 1197 | |
13f5251f CMW |
1198 | /* Another skb with the same unconfirmed conntrack may |
1199 | * win the race. This may happen for bridge(br_flood) | |
1200 | * or broadcast/multicast packets do skb_clone with | |
1201 | * unconfirmed conntrack. | |
93bb0ceb | 1202 | */ |
13f5251f CMW |
1203 | if (unlikely(nf_ct_is_confirmed(ct))) { |
1204 | WARN_ON_ONCE(1); | |
1205 | nf_conntrack_double_unlock(hash, reply_hash); | |
1206 | local_bh_enable(); | |
1207 | return NF_DROP; | |
1208 | } | |
1209 | ||
c56716c6 FW |
1210 | if (!nf_ct_ext_valid_pre(ct->ext)) { |
1211 | NF_CT_STAT_INC(net, insert_failed); | |
1212 | goto dying; | |
1213 | } | |
1214 | ||
8ca3f5e9 PNA |
1215 | /* We have to check the DYING flag after unlink to prevent |
1216 | * a race against nf_ct_get_next_corpse() possibly called from | |
1217 | * user context, else we insert an already 'dead' hash, blocking | |
1218 | * further use of that particular connection -JM. | |
1219 | */ | |
1397af5b | 1220 | ct->status |= IPS_CONFIRMED; |
8ca3f5e9 | 1221 | |
71d8c47f | 1222 | if (unlikely(nf_ct_is_dying(ct))) { |
bb89abe5 | 1223 | NF_CT_STAT_INC(net, insert_failed); |
71d8c47f PNA |
1224 | goto dying; |
1225 | } | |
fc350777 | 1226 | |
8032bf12 | 1227 | max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); |
9fb9cbb1 YK |
1228 | /* See if there's one in the list already, including reverse: |
1229 | NAT could have grabbed it without realizing, since we're | |
1230 | not in the hash. If there is, we lost the race. */ | |
d7e7747a | 1231 | hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { |
86804348 | 1232 | if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, |
e0c7d472 | 1233 | zone, net)) |
df0933dc | 1234 | goto out; |
c9c3b681 | 1235 | if (chainlen++ > max_chainlen) |
d7e7747a FW |
1236 | goto chaintoolong; |
1237 | } | |
86804348 | 1238 | |
d7e7747a FW |
1239 | chainlen = 0; |
1240 | hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { | |
86804348 | 1241 | if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, |
e0c7d472 | 1242 | zone, net)) |
df0933dc | 1243 | goto out; |
c9c3b681 | 1244 | if (chainlen++ > max_chainlen) { |
d7e7747a | 1245 | chaintoolong: |
d7e7747a FW |
1246 | NF_CT_STAT_INC(net, chaintoolong); |
1247 | NF_CT_STAT_INC(net, insert_failed); | |
1248 | ret = NF_DROP; | |
1249 | goto dying; | |
1250 | } | |
1251 | } | |
9fb9cbb1 | 1252 | |
df0933dc PM |
1253 | /* Timer relative to confirmation time, not original |
1254 | setting time, otherwise we'd get timer wrap in | |
1255 | weird delay cases. */ | |
f330a7fd | 1256 | ct->timeout += nfct_time_stamp; |
5c8ec910 | 1257 | |
b1b32552 | 1258 | __nf_conntrack_insert_prepare(ct); |
a992ca2a | 1259 | |
5c8ec910 PM |
1260 | /* Since the lookup is lockless, hash insertion must be done after |
1261 | * starting the timer and setting the CONFIRMED bit. The RCU barriers | |
1262 | * guarantee that no other CPU can find the conntrack before the above | |
1263 | * stores are visible. | |
1264 | */ | |
b476b72a | 1265 | __nf_conntrack_hash_insert(ct, hash, reply_hash); |
93bb0ceb | 1266 | nf_conntrack_double_unlock(hash, reply_hash); |
93bb0ceb | 1267 | local_bh_enable(); |
5c8ec910 | 1268 | |
c56716c6 FW |
1269 | /* ext area is still valid (rcu read lock is held, |
1270 | * but will go out of scope soon), so we need to remove | |
1271 | * this conntrack again. | |
1272 | */ | |
1273 | if (!nf_ct_ext_valid_post(ct->ext)) { | |
1274 | nf_ct_kill(ct); | |
9464d0b6 | 1275 | NF_CT_STAT_INC_ATOMIC(net, drop); |
c56716c6 FW |
1276 | return NF_DROP; |
1277 | } | |
1278 | ||
df0933dc PM |
1279 | help = nfct_help(ct); |
1280 | if (help && help->helper) | |
a71996fc | 1281 | nf_conntrack_event_cache(IPCT_HELPER, ct); |
17e6e4ea | 1282 | |
df0933dc | 1283 | nf_conntrack_event_cache(master_ct(ct) ? |
a71996fc | 1284 | IPCT_RELATED : IPCT_NEW, ct); |
df0933dc | 1285 | return NF_ACCEPT; |
9fb9cbb1 | 1286 | |
df0933dc | 1287 | out: |
6a757c07 | 1288 | ret = nf_ct_resolve_clash(skb, h, reply_hash); |
71d8c47f | 1289 | dying: |
93bb0ceb | 1290 | nf_conntrack_double_unlock(hash, reply_hash); |
93bb0ceb | 1291 | local_bh_enable(); |
71d8c47f | 1292 | return ret; |
9fb9cbb1 | 1293 | } |
13b18339 | 1294 | EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); |
9fb9cbb1 | 1295 | |
e5d015a1 | 1296 | /* Returns true if a connection corresponds to the tuple (required |
9fb9cbb1 YK |
1297 | for NAT). */ |
1298 | int | |
1299 | nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, | |
1300 | const struct nf_conn *ignored_conntrack) | |
1301 | { | |
400dad39 | 1302 | struct net *net = nf_ct_net(ignored_conntrack); |
308ac914 | 1303 | const struct nf_conntrack_zone *zone; |
9fb9cbb1 | 1304 | struct nf_conntrack_tuple_hash *h; |
5e3c61f9 | 1305 | struct hlist_nulls_head *ct_hash; |
92e47ba8 | 1306 | unsigned int hash, hsize; |
ea781f19 | 1307 | struct hlist_nulls_node *n; |
5d0aa2cc | 1308 | struct nf_conn *ct; |
308ac914 DB |
1309 | |
1310 | zone = nf_ct_zone(ignored_conntrack); | |
9fb9cbb1 | 1311 | |
2cf12348 | 1312 | rcu_read_lock(); |
95a8d19f | 1313 | begin: |
92e47ba8 | 1314 | nf_conntrack_get_ht(&ct_hash, &hsize); |
b16ac3c4 | 1315 | hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize); |
5e3c61f9 FW |
1316 | |
1317 | hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { | |
5d0aa2cc | 1318 | ct = nf_ct_tuplehash_to_ctrack(h); |
f330a7fd FW |
1319 | |
1320 | if (ct == ignored_conntrack) | |
1321 | continue; | |
1322 | ||
1323 | if (nf_ct_is_expired(ct)) { | |
1324 | nf_ct_gc_expired(ct); | |
1325 | continue; | |
1326 | } | |
1327 | ||
1328 | if (nf_ct_key_equal(h, tuple, zone, net)) { | |
4e35c1cb MP |
1329 | /* Tuple is taken already, so caller will need to find |
1330 | * a new source port to use. | |
1331 | * | |
1332 | * Only exception: | |
1333 | * If the *original tuples* are identical, then both | |
1334 | * conntracks refer to the same flow. | |
1335 | * This is a rare situation, it can occur e.g. when | |
1336 | * more than one UDP packet is sent from same socket | |
1337 | * in different threads. | |
1338 | * | |
1339 | * Let nf_ct_resolve_clash() deal with this later. | |
1340 | */ | |
1341 | if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, | |
07998281 FW |
1342 | &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && |
1343 | nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) | |
4e35c1cb MP |
1344 | continue; |
1345 | ||
2cf12348 FW |
1346 | NF_CT_STAT_INC_ATOMIC(net, found); |
1347 | rcu_read_unlock(); | |
ba419aff PM |
1348 | return 1; |
1349 | } | |
ba419aff | 1350 | } |
95a8d19f FW |
1351 | |
1352 | if (get_nulls_value(n) != hash) { | |
1353 | NF_CT_STAT_INC_ATOMIC(net, search_restart); | |
1354 | goto begin; | |
1355 | } | |
1356 | ||
2cf12348 | 1357 | rcu_read_unlock(); |
9fb9cbb1 | 1358 | |
ba419aff | 1359 | return 0; |
9fb9cbb1 | 1360 | } |
13b18339 | 1361 | EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); |
9fb9cbb1 | 1362 | |
7ae7730f PM |
1363 | #define NF_CT_EVICTION_RANGE 8 |
1364 | ||
9fb9cbb1 YK |
1365 | /* There's a small race here where we may free a just-assured |
1366 | connection. Too bad: we're in trouble anyway. */ | |
242922a0 FW |
1367 | static unsigned int early_drop_list(struct net *net, |
1368 | struct hlist_nulls_head *head) | |
9fb9cbb1 | 1369 | { |
9fb9cbb1 | 1370 | struct nf_conntrack_tuple_hash *h; |
ea781f19 | 1371 | struct hlist_nulls_node *n; |
242922a0 FW |
1372 | unsigned int drops = 0; |
1373 | struct nf_conn *tmp; | |
3e86638e | 1374 | |
242922a0 FW |
1375 | hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { |
1376 | tmp = nf_ct_tuplehash_to_ctrack(h); | |
9fb9cbb1 | 1377 | |
f330a7fd FW |
1378 | if (nf_ct_is_expired(tmp)) { |
1379 | nf_ct_gc_expired(tmp); | |
1380 | continue; | |
1381 | } | |
1382 | ||
242922a0 FW |
1383 | if (test_bit(IPS_ASSURED_BIT, &tmp->status) || |
1384 | !net_eq(nf_ct_net(tmp), net) || | |
1385 | nf_ct_is_dying(tmp)) | |
1386 | continue; | |
76507f69 | 1387 | |
71977437 | 1388 | if (!refcount_inc_not_zero(&tmp->ct_general.use)) |
242922a0 | 1389 | continue; |
76507f69 | 1390 | |
0ed8f619 FW |
1391 | /* load ->ct_net and ->status after refcount increase */ |
1392 | smp_acquire__after_ctrl_dep(); | |
1393 | ||
242922a0 | 1394 | /* kill only if still in same netns -- might have moved due to |
5f0d5a3a | 1395 | * SLAB_TYPESAFE_BY_RCU rules. |
242922a0 FW |
1396 | * |
1397 | * We steal the timer reference. If that fails, the timer has |
1398 | * already fired or someone else deleted it. Just drop ref | |
1399 | * and move to next entry. | |
1400 | */ | |
1401 | if (net_eq(nf_ct_net(tmp), net) && | |
1402 | nf_ct_is_confirmed(tmp) && | |
242922a0 FW |
1403 | nf_ct_delete(tmp, 0, 0)) |
1404 | drops++; | |
1405 | ||
1406 | nf_ct_put(tmp); | |
9fb9cbb1 | 1407 | } |
3e86638e | 1408 | |
242922a0 FW |
1409 | return drops; |
1410 | } | |
9fb9cbb1 | 1411 | |
f393808d | 1412 | static noinline int early_drop(struct net *net, unsigned int hash) |
242922a0 | 1413 | { |
f393808d | 1414 | unsigned int i, bucket; |
9fb9cbb1 | 1415 | |
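/* Scan up to NF_CT_EVICTION_RANGE consecutive buckets, starting at the
 * bucket derived from the new connection's hash; stop as soon as at
 * least one entry could be dropped.
 */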
242922a0 FW |
1416 | for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { |
1417 | struct hlist_nulls_head *ct_hash; | |
f393808d | 1418 | unsigned int hsize, drops; |
242922a0 | 1419 | |
3101e0fc | 1420 | rcu_read_lock(); |
92e47ba8 | 1421 | nf_conntrack_get_ht(&ct_hash, &hsize); |
f393808d VK |
1422 | if (!i) |
1423 | bucket = reciprocal_scale(hash, hsize); | |
1424 | else | |
1425 | bucket = (bucket + 1) % hsize; | |
242922a0 | 1426 | |
f393808d | 1427 | drops = early_drop_list(net, &ct_hash[bucket]); |
3101e0fc LZ |
1428 | rcu_read_unlock(); |
1429 | ||
242922a0 FW |
1430 | if (drops) { |
1431 | NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops); | |
1432 | return true; | |
74138511 | 1433 | } |
9fb9cbb1 | 1434 | } |
3e86638e | 1435 | |
242922a0 | 1436 | return false; |
9fb9cbb1 YK |
1437 | } |
1438 | ||
c6dd940b FW |
1439 | static bool gc_worker_skip_ct(const struct nf_conn *ct) |
1440 | { | |
1441 | return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct); | |
1442 | } | |
1443 | ||
1444 | static bool gc_worker_can_early_drop(const struct nf_conn *ct) | |
1445 | { | |
1446 | const struct nf_conntrack_l4proto *l4proto; | |
df25455e | 1447 | u8 protonum = nf_ct_protonum(ct); |
c6dd940b | 1448 | |
df25455e VB |
1449 | if (test_bit(IPS_OFFLOAD_BIT, &ct->status) && protonum != IPPROTO_UDP) |
1450 | return false; | |
c6dd940b FW |
1451 | if (!test_bit(IPS_ASSURED_BIT, &ct->status)) |
1452 | return true; | |
1453 | ||
df25455e | 1454 | l4proto = nf_ct_l4proto_find(protonum); |
c6dd940b FW |
1455 | if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) |
1456 | return true; | |
1457 | ||
1458 | return false; | |
1459 | } | |
1460 | ||
b87a2f91 FW |
1461 | static void gc_worker(struct work_struct *work) |
1462 | { | |
4608fdfc | 1463 | unsigned int i, hashsz, nf_conntrack_max95 = 0; |
2cfadb76 | 1464 | u32 end_time, start_time = nfct_time_stamp; |
b87a2f91 | 1465 | struct conntrack_gc_work *gc_work; |
2cfadb76 FW |
1466 | unsigned int expired_count = 0; |
1467 | unsigned long next_run; | |
1468 | s32 delta_time; | |
95eabdd2 | 1469 | long count; |
2cfadb76 | 1470 | |
b87a2f91 FW |
1471 | gc_work = container_of(work, struct conntrack_gc_work, dwork.work); |
1472 | ||
4608fdfc | 1473 | i = gc_work->next_bucket; |
c6dd940b FW |
1474 | if (gc_work->early_drop) |
1475 | nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; | |
b87a2f91 | 1476 | |
2cfadb76 FW |
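/* First bucket of a new sweep over the whole table: reset the
 * timeout-averaging state and record when this sweep started.
 */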
1477 | if (i == 0) { |
1478 | gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT; | |
2aa19275 | 1479 | gc_work->count = GC_SCAN_INITIAL_COUNT; |
2cfadb76 FW |
1480 | gc_work->start_time = start_time; |
1481 | } | |
1482 | ||
1483 | next_run = gc_work->avg_timeout; | |
95eabdd2 | 1484 | count = gc_work->count; |
2cfadb76 FW |
1485 | |
1486 | end_time = start_time + GC_SCAN_MAX_DURATION; | |
1487 | ||
b87a2f91 FW |
1488 | do { |
1489 | struct nf_conntrack_tuple_hash *h; | |
1490 | struct hlist_nulls_head *ct_hash; | |
1491 | struct hlist_nulls_node *n; | |
b87a2f91 FW |
1492 | struct nf_conn *tmp; |
1493 | ||
b87a2f91 FW |
1494 | rcu_read_lock(); |
1495 | ||
1496 | nf_conntrack_get_ht(&ct_hash, &hashsz); | |
4608fdfc FW |
1497 | if (i >= hashsz) { |
1498 | rcu_read_unlock(); | |
1499 | break; | |
1500 | } | |
b87a2f91 FW |
1501 | |
1502 | hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { | |
c53bd0e9 | 1503 | struct nf_conntrack_net *cnet; |
c6dd940b | 1504 | struct net *net; |
95eabdd2 | 1505 | long expires; |
c6dd940b | 1506 | |
b87a2f91 FW |
1507 | tmp = nf_ct_tuplehash_to_ctrack(h); |
1508 | ||
90964016 PNA |
1509 | if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { |
1510 | nf_ct_offload_timeout(tmp); | |
df25455e VB |
1511 | if (!nf_conntrack_max95) |
1512 | continue; | |
90964016 PNA |
1513 | } |
1514 | ||
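/* Too many entries reclaimed in this run: save the scan position
 * and the averaging state, then yield and resume from this bucket
 * on the next invocation.
 */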
2cfadb76 FW |
1515 | if (expired_count > GC_SCAN_EXPIRED_MAX) { |
1516 | rcu_read_unlock(); | |
1517 | ||
1518 | gc_work->next_bucket = i; | |
1519 | gc_work->avg_timeout = next_run; | |
95eabdd2 | 1520 | gc_work->count = count; |
2cfadb76 FW |
1521 | |
1522 | delta_time = nfct_time_stamp - gc_work->start_time; | |
1523 | ||
1524 | /* re-sched immediately if total cycle time is exceeded */ | |
1525 | next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX; | |
1526 | goto early_exit; | |
1527 | } | |
1528 | ||
b87a2f91 FW |
1529 | if (nf_ct_is_expired(tmp)) { |
1530 | nf_ct_gc_expired(tmp); | |
2cfadb76 | 1531 | expired_count++; |
b87a2f91 FW |
1532 | continue; |
1533 | } | |
c6dd940b | 1534 | |
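/* Maintain an incremental mean of the (clamped) remaining timeouts
 * seen so far; it is used below to derive the delay until the next
 * scan.
 */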
2cfadb76 | 1535 | expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP); |
95eabdd2 | 1536 | expires = (expires - (long)next_run) / ++count; |
2cfadb76 | 1537 | next_run += expires; |
2cfadb76 | 1538 | |
c6dd940b FW |
1539 | if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp)) |
1540 | continue; | |
1541 | ||
1542 | net = nf_ct_net(tmp); | |
0418b989 | 1543 | cnet = nf_ct_pernet(net); |
c53bd0e9 | 1544 | if (atomic_read(&cnet->count) < nf_conntrack_max95) |
c6dd940b FW |
1545 | continue; |
1546 | ||
1547 | /* need to take reference to avoid possible races */ | |
71977437 | 1548 | if (!refcount_inc_not_zero(&tmp->ct_general.use)) |
c6dd940b FW |
1549 | continue; |
1550 | ||
0ed8f619 FW |
1551 | /* load ->status after refcount increase */ |
1552 | smp_acquire__after_ctrl_dep(); | |
1553 | ||
c6dd940b FW |
1554 | if (gc_worker_skip_ct(tmp)) { |
1555 | nf_ct_put(tmp); | |
1556 | continue; | |
1557 | } | |
1558 | ||
2cfadb76 | 1559 | if (gc_worker_can_early_drop(tmp)) { |
c6dd940b | 1560 | nf_ct_kill(tmp); |
2cfadb76 FW |
1561 | expired_count++; |
1562 | } | |
c6dd940b FW |
1563 | |
1564 | nf_ct_put(tmp); | |
b87a2f91 FW |
1565 | } |
1566 | ||
1567 | /* could check get_nulls_value() here and restart if ct | |
1568 | * was moved to another chain. But given gc is best-effort | |
1569 | * we will just continue with the next hash slot. |
1570 | */ | |
1571 | rcu_read_unlock(); | |
ffa53c58 | 1572 | cond_resched(); |
4608fdfc FW |
1573 | i++; |
1574 | ||
2cfadb76 FW |
1575 | delta_time = nfct_time_stamp - end_time; |
1576 | if (delta_time > 0 && i < hashsz) { | |
1577 | gc_work->avg_timeout = next_run; | |
95eabdd2 | 1578 | gc_work->count = count; |
4608fdfc FW |
1579 | gc_work->next_bucket = i; |
1580 | next_run = 0; | |
2cfadb76 | 1581 | goto early_exit; |
4608fdfc FW |
1582 | } |
1583 | } while (i < hashsz); | |
b87a2f91 | 1584 | |
2cfadb76 FW |
1585 | gc_work->next_bucket = 0; |
1586 | ||
1587 | next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX); | |
1588 | ||
1589 | delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1); | |
1590 | if (next_run > (unsigned long)delta_time) | |
1591 | next_run -= delta_time; | |
1592 | else | |
1593 | next_run = 1; | |
1594 | ||
1595 | early_exit: | |
b87a2f91 FW |
1596 | if (gc_work->exiting) |
1597 | return; | |
1598 | ||
2cfadb76 | 1599 | if (next_run) |
4608fdfc | 1600 | gc_work->early_drop = false; |
2cfadb76 | 1601 | |
0984d427 | 1602 | queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); |
b87a2f91 FW |
1603 | } |
1604 | ||
1605 | static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) | |
1606 | { | |
2cfadb76 | 1607 | INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); |
b87a2f91 FW |
1608 | gc_work->exiting = false; |
1609 | } | |
1610 | ||
99f07e91 | 1611 | static struct nf_conn * |
308ac914 DB |
1612 | __nf_conntrack_alloc(struct net *net, |
1613 | const struct nf_conntrack_zone *zone, | |
99f07e91 CG |
1614 | const struct nf_conntrack_tuple *orig, |
1615 | const struct nf_conntrack_tuple *repl, | |
1616 | gfp_t gfp, u32 hash) | |
9fb9cbb1 | 1617 | { |
0418b989 | 1618 | struct nf_conntrack_net *cnet = nf_ct_pernet(net); |
c53bd0e9 | 1619 | unsigned int ct_count; |
cd7fcbf1 | 1620 | struct nf_conn *ct; |
9fb9cbb1 | 1621 | |
5251e2d2 | 1622 | /* We don't want any race condition at the early drop stage */ |
c53bd0e9 | 1623 | ct_count = atomic_inc_return(&cnet->count); |
5251e2d2 | 1624 | |
c53bd0e9 | 1625 | if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { |
93bb0ceb | 1626 | if (!early_drop(net, hash)) { |
c6dd940b FW |
1627 | if (!conntrack_gc_work.early_drop) |
1628 | conntrack_gc_work.early_drop = true; | |
c53bd0e9 | 1629 | atomic_dec(&cnet->count); |
e87cc472 | 1630 | net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); |
9fb9cbb1 YK |
1631 | return ERR_PTR(-ENOMEM); |
1632 | } | |
1633 | } | |
1634 | ||
941297f4 ED |
1635 | /* |
1636 | * Do not use kmem_cache_zalloc(), as this cache uses | |
5f0d5a3a | 1637 | * SLAB_TYPESAFE_BY_RCU. |
941297f4 | 1638 | */ |
0c5366b3 | 1639 | ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); |
5e8018fc DB |
1640 | if (ct == NULL) |
1641 | goto out; | |
1642 | ||
440f0d58 | 1643 | spin_lock_init(&ct->lock); |
c88130bc | 1644 | ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; |
941297f4 | 1645 | ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; |
c88130bc | 1646 | ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; |
99f07e91 CG |
1647 | /* save hash for reusing when confirming */ |
1648 | *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; | |
c41884ce | 1649 | ct->status = 0; |
802a7dc5 | 1650 | WRITE_ONCE(ct->timeout, 0); |
c2d9ba9b | 1651 | write_pnet(&ct->ct_net, net); |
4be1dbb7 | 1652 | memset_after(ct, 0, __nfct_init_offset); |
5e8018fc | 1653 | |
6c8dee98 | 1654 | nf_ct_zone_add(ct, zone); |
5e8018fc | 1655 | |
e53376be PNA |
1656 | /* Because we use RCU lookups, we set ct_general.use to zero before |
1657 | * this is inserted in any list. | |
941297f4 | 1658 | */ |
71977437 | 1659 | refcount_set(&ct->ct_general.use, 0); |
c88130bc | 1660 | return ct; |
5e8018fc | 1661 | out: |
c53bd0e9 | 1662 | atomic_dec(&cnet->count); |
5d0aa2cc | 1663 | return ERR_PTR(-ENOMEM); |
9fb9cbb1 | 1664 | } |
99f07e91 | 1665 | |
308ac914 DB |
1666 | struct nf_conn *nf_conntrack_alloc(struct net *net, |
1667 | const struct nf_conntrack_zone *zone, | |
99f07e91 CG |
1668 | const struct nf_conntrack_tuple *orig, |
1669 | const struct nf_conntrack_tuple *repl, | |
1670 | gfp_t gfp) | |
1671 | { | |
1672 | return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0); | |
1673 | } | |
13b18339 | 1674 | EXPORT_SYMBOL_GPL(nf_conntrack_alloc); |
9fb9cbb1 | 1675 | |
c88130bc | 1676 | void nf_conntrack_free(struct nf_conn *ct) |
76507f69 | 1677 | { |
1d45209d | 1678 | struct net *net = nf_ct_net(ct); |
c53bd0e9 | 1679 | struct nf_conntrack_net *cnet; |
1d45209d | 1680 | |
e53376be | 1681 | /* A freed object has refcnt == 0, that's |
5f0d5a3a | 1682 | * the golden rule for SLAB_TYPESAFE_BY_RCU |
e53376be | 1683 | */ |
71977437 | 1684 | WARN_ON(refcount_read(&ct->ct_general.use) != 0); |
e53376be | 1685 | |
1bc91a5d FW |
1686 | if (ct->status & IPS_SRC_NAT_DONE) { |
1687 | const struct nf_nat_hook *nat_hook; | |
1688 | ||
1689 | rcu_read_lock(); | |
1690 | nat_hook = rcu_dereference(nf_nat_hook); | |
1691 | if (nat_hook) | |
1692 | nat_hook->remove_nat_bysrc(ct); | |
1693 | rcu_read_unlock(); | |
1694 | } | |
1695 | ||
1696 | kfree(ct->ext); | |
0c5366b3 | 1697 | kmem_cache_free(nf_conntrack_cachep, ct); |
0418b989 | 1698 | cnet = nf_ct_pernet(net); |
c53bd0e9 | 1699 | |
4e857c58 | 1700 | smp_mb__before_atomic(); |
c53bd0e9 | 1701 | atomic_dec(&cnet->count); |
76507f69 | 1702 | } |
13b18339 | 1703 | EXPORT_SYMBOL_GPL(nf_conntrack_free); |
9fb9cbb1 | 1704 | |
c539f017 | 1705 | |
9fb9cbb1 YK |
1706 | /* Allocate a new conntrack: we return -ENOMEM if classification |
1707 | failed due to stress. Otherwise it really is unclassifiable. */ | |
fc09e4a7 | 1708 | static noinline struct nf_conntrack_tuple_hash * |
b2a15a60 | 1709 | init_conntrack(struct net *net, struct nf_conn *tmpl, |
5a1fb391 | 1710 | const struct nf_conntrack_tuple *tuple, |
9fb9cbb1 | 1711 | struct sk_buff *skb, |
60b5f8f7 | 1712 | unsigned int dataoff, u32 hash) |
9fb9cbb1 | 1713 | { |
c88130bc | 1714 | struct nf_conn *ct; |
3c158f7f | 1715 | struct nf_conn_help *help; |
9fb9cbb1 | 1716 | struct nf_conntrack_tuple repl_tuple; |
b0a7ab4a | 1717 | #ifdef CONFIG_NF_CONNTRACK_EVENTS |
b2a15a60 | 1718 | struct nf_conntrack_ecache *ecache; |
b0a7ab4a | 1719 | #endif |
ca7433df | 1720 | struct nf_conntrack_expect *exp = NULL; |
308ac914 | 1721 | const struct nf_conntrack_zone *zone; |
60b5f8f7 | 1722 | struct nf_conn_timeout *timeout_ext; |
5e8018fc | 1723 | struct nf_conntrack_zone tmp; |
f6f2e580 | 1724 | struct nf_conntrack_net *cnet; |
9fb9cbb1 | 1725 | |
50bfbb89 | 1726 | if (!nf_ct_invert_tuple(&repl_tuple, tuple)) |
9fb9cbb1 | 1727 | return NULL; |
9fb9cbb1 | 1728 | |
5e8018fc | 1729 | zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); |
99f07e91 CG |
1730 | ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, |
1731 | hash); | |
0a9ee813 | 1732 | if (IS_ERR(ct)) |
c88130bc | 1733 | return (struct nf_conntrack_tuple_hash *)ct; |
9fb9cbb1 | 1734 | |
4440a2ab GF |
1735 | if (!nf_ct_add_synproxy(ct, tmpl)) { |
1736 | nf_conntrack_free(ct); | |
1737 | return ERR_PTR(-ENOMEM); | |
48b1de4c PM |
1738 | } |
1739 | ||
60b5f8f7 | 1740 | timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; |
60b5f8f7 | 1741 | |
60b5f8f7 | 1742 | if (timeout_ext) |
ae2d708e PNA |
1743 | nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), |
1744 | GFP_ATOMIC); | |
60b5f8f7 | 1745 | |
58401572 | 1746 | nf_ct_acct_ext_add(ct, GFP_ATOMIC); |
a992ca2a | 1747 | nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); |
c539f017 | 1748 | nf_ct_labels_ext_add(ct); |
b2a15a60 | 1749 | |
b0a7ab4a | 1750 | #ifdef CONFIG_NF_CONNTRACK_EVENTS |
b2a15a60 | 1751 | ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; |
b0a7ab4a | 1752 | |
90d1daa4 FW |
1753 | if ((ecache || net->ct.sysctl_events) && |
1754 | !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, | |
b0a7ab4a FW |
1755 | ecache ? ecache->expmask : 0, |
1756 | GFP_ATOMIC)) { | |
1757 | nf_conntrack_free(ct); | |
1758 | return ERR_PTR(-ENOMEM); | |
1759 | } | |
1760 | #endif | |
58401572 | 1761 | |
0418b989 | 1762 | cnet = nf_ct_pernet(net); |
f6f2e580 | 1763 | if (cnet->expect_count) { |
0bcfbafb | 1764 | spin_lock_bh(&nf_conntrack_expect_lock); |
ca7433df JDB |
1765 | exp = nf_ct_find_expectation(net, zone, tuple); |
1766 | if (exp) { | |
ca7433df JDB |
1767 | /* Welcome, Mr. Bond. We've been expecting you... */ |
1768 | __set_bit(IPS_EXPECTED_BIT, &ct->status); | |
1769 | /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ | |
1770 | ct->master = exp->master; | |
1771 | if (exp->helper) { | |
440534d3 | 1772 | help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); |
ca7433df JDB |
1773 | if (help) |
1774 | rcu_assign_pointer(help->helper, exp->helper); | |
1775 | } | |
ceceae1b | 1776 | |
9fb9cbb1 | 1777 | #ifdef CONFIG_NF_CONNTRACK_MARK |
52d1aa8b | 1778 | ct->mark = READ_ONCE(exp->master->mark); |
7c9728c3 JM |
1779 | #endif |
1780 | #ifdef CONFIG_NF_CONNTRACK_SECMARK | |
ca7433df | 1781 | ct->secmark = exp->master->secmark; |
9fb9cbb1 | 1782 | #endif |
ca7433df JDB |
1783 | NF_CT_STAT_INC(net, expect_new); |
1784 | } | |
0bcfbafb | 1785 | spin_unlock_bh(&nf_conntrack_expect_lock); |
ca7433df | 1786 | } |
b1185090 | 1787 | if (!exp && tmpl) |
b2a15a60 | 1788 | __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); |
9fb9cbb1 | 1789 | |
0ed8f619 FW |
1790 | /* Another CPU might have obtained a pointer to this object before it was |
1791 | * released. Because refcount is 0, refcount_inc_not_zero() will fail. | |
1792 | * | |
1793 | * After refcount_set(1) it will succeed; ensure that zeroing of | |
1794 | * ct->status and the correct ct->net pointer are visible; else other | |
1795 | * core might observe the CONFIRMED bit, which would mean the entry is valid |
1796 | * and in the hash table, but it's not (anymore). |
1797 | */ | |
1798 | smp_wmb(); | |
1799 | ||
8a75a2c1 | 1800 | /* Now it is going to be associated with an sk_buff, set refcount to 1. */ |
71977437 | 1801 | refcount_set(&ct->ct_general.use, 1); |
9fb9cbb1 | 1802 | |
9fb9cbb1 YK |
1803 | if (exp) { |
1804 | if (exp->expectfn) | |
c88130bc | 1805 | exp->expectfn(ct, exp); |
6823645d | 1806 | nf_ct_expect_put(exp); |
9fb9cbb1 YK |
1807 | } |
1808 | ||
c88130bc | 1809 | return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; |
9fb9cbb1 YK |
1810 | } |
1811 | ||
fc09e4a7 FW |
1812 | /* On success, returns 0, sets skb->_nfct | ctinfo */ |
1813 | static int | |
93e66024 | 1814 | resolve_normal_ct(struct nf_conn *tmpl, |
a702a65f | 1815 | struct sk_buff *skb, |
9fb9cbb1 | 1816 | unsigned int dataoff, |
9fb9cbb1 | 1817 | u_int8_t protonum, |
93e66024 | 1818 | const struct nf_hook_state *state) |
9fb9cbb1 | 1819 | { |
308ac914 | 1820 | const struct nf_conntrack_zone *zone; |
9fb9cbb1 YK |
1821 | struct nf_conntrack_tuple tuple; |
1822 | struct nf_conntrack_tuple_hash *h; | |
fc09e4a7 | 1823 | enum ip_conntrack_info ctinfo; |
5e8018fc | 1824 | struct nf_conntrack_zone tmp; |
b16ac3c4 | 1825 | u32 hash, zone_id, rid; |
9fb9cbb1 YK |
1826 | struct nf_conn *ct; |
1827 | ||
bbe735e4 | 1828 | if (!nf_ct_get_tuple(skb, skb_network_offset(skb), |
93e66024 | 1829 | dataoff, state->pf, protonum, state->net, |
50bfbb89 | 1830 | &tuple)) |
fc09e4a7 | 1831 | return 0; |
9fb9cbb1 YK |
1832 | |
1833 | /* look for tuple match */ | |
5e8018fc | 1834 | zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); |
b16ac3c4 FW |
1835 | |
1836 | zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); | |
1837 | hash = hash_conntrack_raw(&tuple, zone_id, state->net); | |
93e66024 | 1838 | h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); |
b16ac3c4 FW |
1839 | |
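/* With a directional zone the original and reply direction ids can
 * differ, and the entry may have been hashed under the reply-direction
 * id, so retry the lookup with that hash.
 */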
1840 | if (!h) { | |
1841 | rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); | |
1842 | if (zone_id != rid) { | |
1843 | u32 tmp = hash_conntrack_raw(&tuple, rid, state->net); | |
1844 | ||
1845 | h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp); | |
1846 | } | |
1847 | } | |
1848 | ||
9fb9cbb1 | 1849 | if (!h) { |
303e0c55 | 1850 | h = init_conntrack(state->net, tmpl, &tuple, |
60b5f8f7 | 1851 | skb, dataoff, hash); |
9fb9cbb1 | 1852 | if (!h) |
fc09e4a7 | 1853 | return 0; |
9fb9cbb1 | 1854 | if (IS_ERR(h)) |
fc09e4a7 | 1855 | return PTR_ERR(h); |
9fb9cbb1 YK |
1856 | } |
1857 | ct = nf_ct_tuplehash_to_ctrack(h); | |
1858 | ||
1859 | /* It exists; we have (non-exclusive) reference. */ | |
1860 | if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { | |
fc09e4a7 | 1861 | ctinfo = IP_CT_ESTABLISHED_REPLY; |
9fb9cbb1 | 1862 | } else { |
4883ec51 FW |
1863 | unsigned long status = READ_ONCE(ct->status); |
1864 | ||
9fb9cbb1 | 1865 | /* Once we've had two way comms, always ESTABLISHED. */ |
4883ec51 | 1866 | if (likely(status & IPS_SEEN_REPLY)) |
fc09e4a7 | 1867 | ctinfo = IP_CT_ESTABLISHED; |
4883ec51 | 1868 | else if (status & IPS_EXPECTED) |
fc09e4a7 | 1869 | ctinfo = IP_CT_RELATED; |
4883ec51 | 1870 | else |
fc09e4a7 | 1871 | ctinfo = IP_CT_NEW; |
9fb9cbb1 | 1872 | } |
fc09e4a7 FW |
1873 | nf_ct_set(skb, ct, ctinfo); |
1874 | return 0; | |
9fb9cbb1 YK |
1875 | } |
1876 | ||
6fe78fa4 FW |
1877 | /* |
1878 | * icmp packets need special treatment to handle error messages that are | |
1879 | * related to a connection. | |
1880 | * | |
1881 | * Callers need to check if skb has a conntrack assigned when this | |
1882 | * helper returns; in that case the skb belongs to an already known connection. |
1883 | */ | |
1884 | static unsigned int __cold | |
1885 | nf_conntrack_handle_icmp(struct nf_conn *tmpl, | |
1886 | struct sk_buff *skb, | |
1887 | unsigned int dataoff, | |
1888 | u8 protonum, | |
1889 | const struct nf_hook_state *state) | |
1890 | { | |
1891 | int ret; | |
1892 | ||
1893 | if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP) | |
1894 | ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state); | |
1895 | #if IS_ENABLED(CONFIG_IPV6) | |
1896 | else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6) | |
1897 | ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state); | |
1898 | #endif | |
1899 | else | |
1900 | return NF_ACCEPT; | |
1901 | ||
b1328e54 | 1902 | if (ret <= 0) |
6fe78fa4 | 1903 | NF_CT_STAT_INC_ATOMIC(state->net, error); |
6fe78fa4 FW |
1904 | |
1905 | return ret; | |
1906 | } | |
1907 | ||
44fb87f6 FW |
1908 | static int generic_packet(struct nf_conn *ct, struct sk_buff *skb, |
1909 | enum ip_conntrack_info ctinfo) | |
1910 | { | |
1911 | const unsigned int *timeout = nf_ct_timeout_lookup(ct); | |
1912 | ||
1913 | if (!timeout) | |
1914 | timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; | |
1915 | ||
1916 | nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); | |
1917 | return NF_ACCEPT; | |
1918 | } | |
1919 | ||
a47c5404 FW |
1920 | /* Returns verdict for packet, or -1 for invalid. */ |
1921 | static int nf_conntrack_handle_packet(struct nf_conn *ct, | |
1922 | struct sk_buff *skb, | |
1923 | unsigned int dataoff, | |
1924 | enum ip_conntrack_info ctinfo, | |
1925 | const struct nf_hook_state *state) | |
1926 | { | |
1927 | switch (nf_ct_protonum(ct)) { | |
1928 | case IPPROTO_TCP: | |
1929 | return nf_conntrack_tcp_packet(ct, skb, dataoff, | |
1930 | ctinfo, state); | |
1931 | case IPPROTO_UDP: | |
1932 | return nf_conntrack_udp_packet(ct, skb, dataoff, | |
1933 | ctinfo, state); | |
1934 | case IPPROTO_ICMP: | |
1935 | return nf_conntrack_icmp_packet(ct, skb, ctinfo, state); | |
81e01647 | 1936 | #if IS_ENABLED(CONFIG_IPV6) |
a47c5404 FW |
1937 | case IPPROTO_ICMPV6: |
1938 | return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state); | |
81e01647 | 1939 | #endif |
a47c5404 FW |
1940 | #ifdef CONFIG_NF_CT_PROTO_UDPLITE |
1941 | case IPPROTO_UDPLITE: | |
1942 | return nf_conntrack_udplite_packet(ct, skb, dataoff, | |
1943 | ctinfo, state); | |
1944 | #endif | |
1945 | #ifdef CONFIG_NF_CT_PROTO_SCTP | |
1946 | case IPPROTO_SCTP: | |
1947 | return nf_conntrack_sctp_packet(ct, skb, dataoff, | |
1948 | ctinfo, state); | |
1949 | #endif | |
1950 | #ifdef CONFIG_NF_CT_PROTO_DCCP | |
1951 | case IPPROTO_DCCP: | |
1952 | return nf_conntrack_dccp_packet(ct, skb, dataoff, | |
1953 | ctinfo, state); | |
44fb87f6 FW |
1954 | #endif |
1955 | #ifdef CONFIG_NF_CT_PROTO_GRE | |
1956 | case IPPROTO_GRE: | |
1957 | return nf_conntrack_gre_packet(ct, skb, dataoff, | |
1958 | ctinfo, state); | |
a47c5404 FW |
1959 | #endif |
1960 | } | |
1961 | ||
44fb87f6 | 1962 | return generic_packet(ct, skb, ctinfo); |
a47c5404 FW |
1963 | } |
1964 | ||
9fb9cbb1 | 1965 | unsigned int |
93e66024 | 1966 | nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) |
9fb9cbb1 | 1967 | { |
9fb9cbb1 | 1968 | enum ip_conntrack_info ctinfo; |
93e66024 | 1969 | struct nf_conn *ct, *tmpl; |
9fb9cbb1 | 1970 | u_int8_t protonum; |
6816d931 | 1971 | int dataoff, ret; |
9fb9cbb1 | 1972 | |
97a6ad13 | 1973 | tmpl = nf_ct_get(skb, &ctinfo); |
cc41c84b | 1974 | if (tmpl || ctinfo == IP_CT_UNTRACKED) { |
b2a15a60 | 1975 | /* Previously seen (loopback or untracked)? Ignore. */ |
cc41c84b | 1976 | if ((tmpl && !nf_ct_is_template(tmpl)) || |
4afc41df | 1977 | ctinfo == IP_CT_UNTRACKED) |
b2a15a60 | 1978 | return NF_ACCEPT; |
a9e419dc | 1979 | skb->_nfct = 0; |
9fb9cbb1 YK |
1980 | } |
1981 | ||
e2361cb9 | 1982 | /* rcu_read_lock()ed by nf_hook_thresh */ |
93e66024 | 1983 | dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); |
6816d931 | 1984 | if (dataoff <= 0) { |
93e66024 | 1985 | NF_CT_STAT_INC_ATOMIC(state->net, invalid); |
6816d931 | 1986 | ret = NF_ACCEPT; |
b2a15a60 | 1987 | goto out; |
9fb9cbb1 YK |
1988 | } |
1989 | ||
6fe78fa4 FW |
1990 | if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { |
1991 | ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, | |
1992 | protonum, state); | |
74c51a14 | 1993 | if (ret <= 0) { |
b2a15a60 PM |
1994 | ret = -ret; |
1995 | goto out; | |
74c51a14 | 1996 | } |
88ed01d1 | 1997 | /* ICMP[v6] protocol trackers may assign one conntrack. */ |
a9e419dc | 1998 | if (skb->_nfct) |
88ed01d1 | 1999 | goto out; |
9fb9cbb1 | 2000 | } |
08733a0c | 2001 | repeat: |
93e66024 | 2002 | ret = resolve_normal_ct(tmpl, skb, dataoff, |
303e0c55 | 2003 | protonum, state); |
fc09e4a7 | 2004 | if (ret < 0) { |
9fb9cbb1 | 2005 | /* Too stressed to deal. */ |
93e66024 | 2006 | NF_CT_STAT_INC_ATOMIC(state->net, drop); |
b2a15a60 PM |
2007 | ret = NF_DROP; |
2008 | goto out; | |
9fb9cbb1 YK |
2009 | } |
2010 | ||
fc09e4a7 FW |
2011 | ct = nf_ct_get(skb, &ctinfo); |
2012 | if (!ct) { | |
2013 | /* Not valid part of a connection */ | |
93e66024 | 2014 | NF_CT_STAT_INC_ATOMIC(state->net, invalid); |
fc09e4a7 FW |
2015 | ret = NF_ACCEPT; |
2016 | goto out; | |
2017 | } | |
9fb9cbb1 | 2018 | |
44fb87f6 | 2019 | ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state); |
ec8d5409 | 2020 | if (ret <= 0) { |
9fb9cbb1 YK |
2021 | /* Invalid: inverse of the return code tells |
2022 | * the netfilter core what to do */ | |
408bdcfc | 2023 | nf_ct_put(ct); |
a9e419dc | 2024 | skb->_nfct = 0; |
56a62e22 AB |
2025 | /* Special case: TCP tracker reports an attempt to reopen a |
2026 | * closed/aborted connection. We have to go back and create a | |
2027 | * fresh conntrack. | |
2028 | */ | |
2029 | if (ret == -NF_REPEAT) | |
2030 | goto repeat; | |
830af2eb FW |
2031 | |
2032 | NF_CT_STAT_INC_ATOMIC(state->net, invalid); | |
2033 | if (ret == -NF_DROP) | |
2034 | NF_CT_STAT_INC_ATOMIC(state->net, drop); | |
2035 | ||
b2a15a60 PM |
2036 | ret = -ret; |
2037 | goto out; | |
9fb9cbb1 YK |
2038 | } |
2039 | ||
fc09e4a7 FW |
2040 | if (ctinfo == IP_CT_ESTABLISHED_REPLY && |
2041 | !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) | |
858b3133 | 2042 | nf_conntrack_event_cache(IPCT_REPLY, ct); |
b2a15a60 | 2043 | out: |
56a62e22 AB |
2044 | if (tmpl) |
2045 | nf_ct_put(tmpl); | |
9fb9cbb1 YK |
2046 | |
2047 | return ret; | |
2048 | } | |
13b18339 | 2049 | EXPORT_SYMBOL_GPL(nf_conntrack_in); |
9fb9cbb1 | 2050 | |
5b1158e9 JK |
2051 | /* Alter reply tuple (maybe alter helper). This is for NAT, and is |
2052 | implicitly racy: see __nf_conntrack_confirm */ | |
2053 | void nf_conntrack_alter_reply(struct nf_conn *ct, | |
2054 | const struct nf_conntrack_tuple *newreply) | |
2055 | { | |
2056 | struct nf_conn_help *help = nfct_help(ct); | |
2057 | ||
5b1158e9 | 2058 | /* Should be unconfirmed, so not in hash table yet */ |
44d6e2f2 | 2059 | WARN_ON(nf_ct_is_confirmed(ct)); |
5b1158e9 | 2060 | |
3c9fba65 | 2061 | nf_ct_dump_tuple(newreply); |
5b1158e9 JK |
2062 | |
2063 | ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; | |
ef1a5a50 | 2064 | if (ct->master || (help && !hlist_empty(&help->expectations))) |
c52fbb41 | 2065 | return; |
5b1158e9 | 2066 | } |
13b18339 | 2067 | EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); |
5b1158e9 | 2068 | |
9fb9cbb1 YK |
2069 | /* Refresh conntrack for this many jiffies and do accounting if do_acct is true */ |
2070 | void __nf_ct_refresh_acct(struct nf_conn *ct, | |
2071 | enum ip_conntrack_info ctinfo, | |
2072 | const struct sk_buff *skb, | |
cc169213 FW |
2073 | u32 extra_jiffies, |
2074 | bool do_acct) | |
9fb9cbb1 | 2075 | { |
997ae831 | 2076 | /* Only update if this is not a fixed timeout */ |
47d95045 PM |
2077 | if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) |
2078 | goto acct; | |
997ae831 | 2079 | |
9fb9cbb1 | 2080 | /* If not in hash table, timer will not be active yet */ |
f330a7fd FW |
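/* For confirmed entries ct->timeout holds an absolute expiry time, so
 * add the current timestamp; unconfirmed entries keep a relative value.
 */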
2081 | if (nf_ct_is_confirmed(ct)) |
2082 | extra_jiffies += nfct_time_stamp; | |
9fb9cbb1 | 2083 | |
e37542ba ED |
2084 | if (READ_ONCE(ct->timeout) != extra_jiffies) |
2085 | WRITE_ONCE(ct->timeout, extra_jiffies); | |
47d95045 | 2086 | acct: |
ba76738c | 2087 | if (do_acct) |
8ac2bd35 | 2088 | nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); |
9fb9cbb1 | 2089 | } |
13b18339 | 2090 | EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); |
9fb9cbb1 | 2091 | |
ad66713f FW |
2092 | bool nf_ct_kill_acct(struct nf_conn *ct, |
2093 | enum ip_conntrack_info ctinfo, | |
2094 | const struct sk_buff *skb) | |
51091764 | 2095 | { |
8ac2bd35 | 2096 | nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); |
58401572 | 2097 | |
f330a7fd | 2098 | return nf_ct_delete(ct, 0, 0); |
51091764 | 2099 | } |
ad66713f | 2100 | EXPORT_SYMBOL_GPL(nf_ct_kill_acct); |
51091764 | 2101 | |
c0cd1156 | 2102 | #if IS_ENABLED(CONFIG_NF_CT_NETLINK) |
c1d10adb PNA |
2103 | |
2104 | #include <linux/netfilter/nfnetlink.h> | |
2105 | #include <linux/netfilter/nfnetlink_conntrack.h> | |
57b47a53 IM |
2106 | #include <linux/mutex.h> |
2107 | ||
05ba4c89 | 2108 | /* Generic function for tcp/udp/sctp/dccp and the like. */ |
fdf70832 | 2109 | int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, |
c1d10adb PNA |
2110 | const struct nf_conntrack_tuple *tuple) |
2111 | { | |
bae65be8 DM |
2112 | if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || |
2113 | nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) | |
2114 | goto nla_put_failure; | |
c1d10adb PNA |
2115 | return 0; |
2116 | ||
df6fb868 | 2117 | nla_put_failure: |
c1d10adb PNA |
2118 | return -1; |
2119 | } | |
fdf70832 | 2120 | EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); |
c1d10adb | 2121 | |
f73e924c PM |
2122 | const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { |
2123 | [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, | |
2124 | [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, | |
c1d10adb | 2125 | }; |
f73e924c | 2126 | EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); |
c1d10adb | 2127 | |
fdf70832 | 2128 | int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], |
cb8aa9a3 RB |
2129 | struct nf_conntrack_tuple *t, |
2130 | u_int32_t flags) | |
c1d10adb | 2131 | { |
cb8aa9a3 RB |
2132 | if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { |
2133 | if (!tb[CTA_PROTO_SRC_PORT]) | |
2134 | return -EINVAL; | |
c1d10adb | 2135 | |
cb8aa9a3 RB |
2136 | t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); |
2137 | } | |
2138 | ||
2139 | if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { | |
2140 | if (!tb[CTA_PROTO_DST_PORT]) | |
2141 | return -EINVAL; | |
2142 | ||
2143 | t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); | |
2144 | } | |
c1d10adb PNA |
2145 | |
2146 | return 0; | |
2147 | } | |
fdf70832 | 2148 | EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); |
5c0de29d | 2149 | |
5caaed15 | 2150 | unsigned int nf_ct_port_nlattr_tuple_size(void) |
5c0de29d | 2151 | { |
5caaed15 FW |
2152 | static unsigned int size __read_mostly; |
2153 | ||
2154 | if (!size) | |
2155 | size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); | |
2156 | ||
2157 | return size; | |
5c0de29d HE |
2158 | } |
2159 | EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); | |
c1d10adb PNA |
2160 | #endif |
2161 | ||
9fb9cbb1 | 2162 | /* Used by ipt_REJECT and ip6t_REJECT. */ |
312a0c16 | 2163 | static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) |
9fb9cbb1 YK |
2164 | { |
2165 | struct nf_conn *ct; | |
2166 | enum ip_conntrack_info ctinfo; | |
2167 | ||
2168 | /* This ICMP is in reverse direction to the packet which caused it */ | |
2169 | ct = nf_ct_get(skb, &ctinfo); | |
2170 | if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) | |
fb048833 | 2171 | ctinfo = IP_CT_RELATED_REPLY; |
9fb9cbb1 YK |
2172 | else |
2173 | ctinfo = IP_CT_RELATED; | |
2174 | ||
2175 | /* Attach to new skbuff, and increment count */ | |
c74454fa | 2176 | nf_ct_set(nskb, ct, ctinfo); |
cb9c6836 | 2177 | nf_conntrack_get(skb_nfct(nskb)); |
9fb9cbb1 YK |
2178 | } |
2179 | ||
ee04805f | 2180 | static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, |
46c1e062 NC |
2181 | struct nf_conn *ct, |
2182 | enum ip_conntrack_info ctinfo) | |
368982cd | 2183 | { |
285c8a7a | 2184 | const struct nf_nat_hook *nat_hook; |
368982cd PNA |
2185 | struct nf_conntrack_tuple_hash *h; |
2186 | struct nf_conntrack_tuple tuple; | |
6816d931 | 2187 | unsigned int status; |
6816d931 | 2188 | int dataoff; |
368982cd PNA |
2189 | u16 l3num; |
2190 | u8 l4num; | |
2191 | ||
368982cd | 2192 | l3num = nf_ct_l3num(ct); |
368982cd | 2193 | |
6816d931 FW |
2194 | dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); |
2195 | if (dataoff <= 0) | |
368982cd PNA |
2196 | return -1; |
2197 | ||
368982cd | 2198 | if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, |
303e0c55 | 2199 | l4num, net, &tuple)) |
368982cd PNA |
2200 | return -1; |
2201 | ||
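/* NAT may already have rewritten this packet; restore the pre-NAT
 * source/destination from the original direction tuple so the lookup
 * below can find a clashing entry that was confirmed in the meantime.
 */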
2202 | if (ct->status & IPS_SRC_NAT) { | |
2203 | memcpy(tuple.src.u3.all, | |
2204 | ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, | |
2205 | sizeof(tuple.src.u3.all)); | |
2206 | tuple.src.u.all = | |
2207 | ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; | |
2208 | } | |
2209 | ||
2210 | if (ct->status & IPS_DST_NAT) { | |
2211 | memcpy(tuple.dst.u3.all, | |
2212 | ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, | |
2213 | sizeof(tuple.dst.u3.all)); | |
2214 | tuple.dst.u.all = | |
2215 | ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; | |
2216 | } | |
2217 | ||
2218 | h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); | |
2219 | if (!h) | |
2220 | return 0; | |
2221 | ||
2222 | /* Store status bits of the conntrack that is clashing to re-do NAT | |
2223 | * mangling according to what has already been done to this packet. |
2224 | */ | |
2225 | status = ct->status; | |
2226 | ||
2227 | nf_ct_put(ct); | |
2228 | ct = nf_ct_tuplehash_to_ctrack(h); | |
2229 | nf_ct_set(skb, ct, ctinfo); | |
2230 | ||
2231 | nat_hook = rcu_dereference(nf_nat_hook); | |
2232 | if (!nat_hook) | |
2233 | return 0; | |
2234 | ||
2235 | if (status & IPS_SRC_NAT && | |
2236 | nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC, | |
2237 | IP_CT_DIR_ORIGINAL) == NF_DROP) | |
2238 | return -1; | |
2239 | ||
2240 | if (status & IPS_DST_NAT && | |
2241 | nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST, | |
2242 | IP_CT_DIR_ORIGINAL) == NF_DROP) | |
2243 | return -1; | |
2244 | ||
2245 | return 0; | |
2246 | } | |
2247 | ||
ee04805f PNA |
2248 | /* This packet is coming from userspace via nf_queue; complete the packet |
2249 | * processing after the helper invocation in nf_confirm(). | |
2250 | */ | |
2251 | static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, | |
2252 | enum ip_conntrack_info ctinfo) | |
2253 | { | |
2254 | const struct nf_conntrack_helper *helper; | |
2255 | const struct nf_conn_help *help; | |
94945ad2 | 2256 | int protoff; |
ee04805f PNA |
2257 | |
2258 | help = nfct_help(ct); | |
2259 | if (!help) | |
2260 | return 0; | |
2261 | ||
2262 | helper = rcu_dereference(help->helper); | |
2263 | if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) | |
2264 | return 0; | |
2265 | ||
2266 | switch (nf_ct_l3num(ct)) { | |
2267 | case NFPROTO_IPV4: | |
2268 | protoff = skb_network_offset(skb) + ip_hdrlen(skb); | |
2269 | break; | |
2270 | #if IS_ENABLED(CONFIG_IPV6) | |
2271 | case NFPROTO_IPV6: { | |
2272 | __be16 frag_off; | |
2273 | u8 pnum; | |
2274 | ||
2275 | pnum = ipv6_hdr(skb)->nexthdr; | |
2276 | protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, | |
2277 | &frag_off); | |
2278 | if (protoff < 0 || (frag_off & htons(~0x7)) != 0) | |
2279 | return 0; | |
2280 | break; | |
2281 | } | |
2282 | #endif | |
2283 | default: | |
2284 | return 0; | |
2285 | } | |
2286 | ||
2287 | if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && | |
2288 | !nf_is_loopback_packet(skb)) { | |
2289 | if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { | |
2290 | NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); | |
2291 | return -1; | |
2292 | } | |
2293 | } | |
2294 | ||
2295 | /* We've seen it coming out the other side: confirm it */ | |
2296 | return nf_conntrack_confirm(skb) == NF_DROP ? - 1 : 0; | |
2297 | } | |
2298 | ||
2299 | static int nf_conntrack_update(struct net *net, struct sk_buff *skb) | |
2300 | { | |
2301 | enum ip_conntrack_info ctinfo; | |
2302 | struct nf_conn *ct; | |
2303 | int err; | |
2304 | ||
2305 | ct = nf_ct_get(skb, &ctinfo); | |
2306 | if (!ct) | |
2307 | return 0; | |
2308 | ||
2309 | if (!nf_ct_is_confirmed(ct)) { | |
46c1e062 | 2310 | err = __nf_conntrack_update(net, skb, ct, ctinfo); |
ee04805f PNA |
2311 | if (err < 0) |
2312 | return err; | |
d005fbb8 PNA |
2313 | |
2314 | ct = nf_ct_get(skb, &ctinfo); | |
ee04805f PNA |
2315 | } |
2316 | ||
2317 | return nf_confirm_cthelper(skb, ct, ctinfo); | |
2318 | } | |
2319 | ||
b60a6040 THJ |
2320 | static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, |
2321 | const struct sk_buff *skb) | |
2322 | { | |
2323 | const struct nf_conntrack_tuple *src_tuple; | |
2324 | const struct nf_conntrack_tuple_hash *hash; | |
2325 | struct nf_conntrack_tuple srctuple; | |
2326 | enum ip_conntrack_info ctinfo; | |
2327 | struct nf_conn *ct; | |
2328 | ||
2329 | ct = nf_ct_get(skb, &ctinfo); | |
2330 | if (ct) { | |
2331 | src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); | |
2332 | memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); | |
2333 | return true; | |
2334 | } | |
2335 | ||
2336 | if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), | |
2337 | NFPROTO_IPV4, dev_net(skb->dev), | |
2338 | &srctuple)) | |
2339 | return false; | |
2340 | ||
2341 | hash = nf_conntrack_find_get(dev_net(skb->dev), | |
2342 | &nf_ct_zone_dflt, | |
2343 | &srctuple); | |
2344 | if (!hash) | |
2345 | return false; | |
2346 | ||
2347 | ct = nf_ct_tuplehash_to_ctrack(hash); | |
2348 | src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); | |
2349 | memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); | |
2350 | nf_ct_put(ct); | |
2351 | ||
2352 | return true; | |
2353 | } | |
2354 | ||
9fb9cbb1 | 2355 | /* Bring out ya dead! */ |
df0933dc | 2356 | static struct nf_conn * |
2843fb69 | 2357 | get_next_corpse(int (*iter)(struct nf_conn *i, void *data), |
8169ff58 | 2358 | const struct nf_ct_iter_data *iter_data, unsigned int *bucket) |
9fb9cbb1 | 2359 | { |
df0933dc PM |
2360 | struct nf_conntrack_tuple_hash *h; |
2361 | struct nf_conn *ct; | |
ea781f19 | 2362 | struct hlist_nulls_node *n; |
93bb0ceb | 2363 | spinlock_t *lockp; |
9fb9cbb1 | 2364 | |
56d52d48 | 2365 | for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { |
e9edc188 ED |
2366 | struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket]; |
2367 | ||
2368 | if (hlist_nulls_empty(hslot)) | |
2369 | continue; | |
2370 | ||
93bb0ceb JDB |
2371 | lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; |
2372 | local_bh_disable(); | |
b16c2919 | 2373 | nf_conntrack_lock(lockp); |
e9edc188 ED |
2374 | hlist_nulls_for_each_entry(h, n, hslot, hnnode) { |
2375 | if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) | |
2376 | continue; | |
2377 | /* All nf_conn objects are added to the hash table twice, once |
2378 | * for the original direction tuple, once for the reply tuple. |
2379 | * | |
2380 | * Exception: In the IPS_NAT_CLASH case, only the reply | |
2381 | * tuple is added (the original tuple already existed for | |
2382 | * a different object). | |
2383 | * | |
2384 | * We only need to call the iterator once for each | |
2385 | * conntrack, so we just use the 'reply' direction | |
2386 | * tuple while iterating. | |
2387 | */ | |
2388 | ct = nf_ct_tuplehash_to_ctrack(h); | |
8169ff58 PNA |
2389 | |
2390 | if (iter_data->net && | |
2391 | !net_eq(iter_data->net, nf_ct_net(ct))) | |
2392 | continue; | |
2393 | ||
2394 | if (iter(ct, iter_data->data)) | |
e9edc188 | 2395 | goto found; |
df0933dc | 2396 | } |
93bb0ceb JDB |
2397 | spin_unlock(lockp); |
2398 | local_bh_enable(); | |
d93c6258 | 2399 | cond_resched(); |
601e68e1 | 2400 | } |
b7779d06 | 2401 | |
b0feacaa FW |
2402 | return NULL; |
2403 | found: | |
71977437 | 2404 | refcount_inc(&ct->ct_general.use); |
b0feacaa FW |
2405 | spin_unlock(lockp); |
2406 | local_bh_enable(); | |
2407 | return ct; | |
2408 | } | |
2409 | ||
2843fb69 | 2410 | static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), |
8169ff58 | 2411 | const struct nf_ct_iter_data *iter_data) |
2843fb69 | 2412 | { |
e9edc188 | 2413 | unsigned int bucket = 0; |
2843fb69 | 2414 | struct nf_conn *ct; |
2843fb69 FW |
2415 | |
2416 | might_sleep(); | |
2417 | ||
e9edc188 | 2418 | mutex_lock(&nf_conntrack_mutex); |
8169ff58 | 2419 | while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) { |
e9edc188 | 2420 | /* Time to push up daisies... */ |
0d02d564 | 2421 | |
8169ff58 | 2422 | nf_ct_delete(ct, iter_data->portid, iter_data->report); |
e9edc188 ED |
2423 | nf_ct_put(ct); |
2424 | cond_resched(); | |
2843fb69 | 2425 | } |
e9edc188 | 2426 | mutex_unlock(&nf_conntrack_mutex); |
2843fb69 FW |
2427 | } |
2428 | ||
8169ff58 PNA |
2429 | void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data), |
2430 | const struct nf_ct_iter_data *iter_data) | |
9fb9cbb1 | 2431 | { |
8169ff58 | 2432 | struct net *net = iter_data->net; |
0418b989 | 2433 | struct nf_conntrack_net *cnet = nf_ct_pernet(net); |
9fb9cbb1 | 2434 | |
d93c6258 FW |
2435 | might_sleep(); |
2436 | ||
c53bd0e9 | 2437 | if (atomic_read(&cnet->count) == 0) |
88b68bc5 FW |
2438 | return; |
2439 | ||
8169ff58 | 2440 | nf_ct_iterate_cleanup(iter, iter_data); |
2843fb69 FW |
2441 | } |
2442 | EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); | |
9fb9cbb1 | 2443 | |
2843fb69 FW |
2444 | /** |
2445 | * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table | |
2446 | * @iter: callback to invoke for each conntrack | |
2447 | * @data: data to pass to @iter | |
2448 | * | |
2449 | * Like nf_ct_iterate_cleanup, but first marks conntracks on the | |
2450 | * unconfirmed list as dying (so they will not be inserted into | |
2451 | * the main table). |
7866cc57 FW |
2452 | * |
2453 | * Can only be called in module exit path. | |
2843fb69 FW |
2454 | */ |
2455 | void | |
2456 | nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) | |
2457 | { | |
8169ff58 | 2458 | struct nf_ct_iter_data iter_data = {}; |
2843fb69 FW |
2459 | struct net *net; |
2460 | ||
f0b07bb1 | 2461 | down_read(&net_rwsem); |
2843fb69 | 2462 | for_each_net(net) { |
0418b989 | 2463 | struct nf_conntrack_net *cnet = nf_ct_pernet(net); |
c53bd0e9 FW |
2464 | |
2465 | if (atomic_read(&cnet->count) == 0) | |
2843fb69 | 2466 | continue; |
e2a75007 | 2467 | nf_queue_nf_hook_drop(net); |
9fb9cbb1 | 2468 | } |
f0b07bb1 | 2469 | up_read(&net_rwsem); |
2843fb69 | 2470 | |
7866cc57 FW |
2471 | /* Need to wait for the netns cleanup worker to finish, if it's |
2472 | * running -- it might have deleted a net namespace from | |
ace53fdc FW |
2473 | * the global list, so hook drop above might not have |
2474 | * affected all namespaces. | |
7866cc57 FW |
2475 | */ |
2476 | net_ns_barrier(); | |
2477 | ||
ace53fdc FW |
2478 | /* an skb with an unconfirmed conntrack could have been reinjected just |
2479 | * before we called nf_queue_nf_hook_drop(). | |
2480 | * | |
2843fb69 FW |
2481 | * This makes sure it's inserted into the conntrack table. |
2482 | */ | |
2483 | synchronize_net(); | |
2484 | ||
c56716c6 | 2485 | nf_ct_ext_bump_genid(); |
8169ff58 PNA |
2486 | iter_data.data = data; |
2487 | nf_ct_iterate_cleanup(iter, &iter_data); | |
ace53fdc FW |
2488 | |
2489 | /* Another cpu might be in a rcu read section with | |
2490 | * rcu protected pointer cleared in iter callback | |
2491 | * or hidden via nf_ct_ext_bump_genid() above. | |
2492 | * | |
2493 | * Wait until those are done. | |
2494 | */ | |
2495 | synchronize_rcu(); | |
9fb9cbb1 | 2496 | } |
2843fb69 | 2497 | EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy); |
9fb9cbb1 | 2498 | |
274d383b PNA |
2499 | static int kill_all(struct nf_conn *i, void *data) |
2500 | { | |
8169ff58 | 2501 | return 1; |
274d383b PNA |
2502 | } |
2503 | ||
f94161c1 | 2504 | void nf_conntrack_cleanup_start(void) |
9fb9cbb1 | 2505 | { |
864b656f | 2506 | cleanup_nf_conntrack_bpf(); |
b87a2f91 | 2507 | conntrack_gc_work.exiting = true; |
f94161c1 G |
2508 | } |
2509 | ||
2510 | void nf_conntrack_cleanup_end(void) | |
2511 | { | |
1f4b2439 | 2512 | RCU_INIT_POINTER(nf_ct_hook, NULL); |
b87a2f91 | 2513 | cancel_delayed_work_sync(&conntrack_gc_work.dwork); |
285189c7 | 2514 | kvfree(nf_conntrack_hash); |
56d52d48 | 2515 | |
04d87001 | 2516 | nf_conntrack_proto_fini(); |
5e615b22 | 2517 | nf_conntrack_helper_fini(); |
83b4dbe1 | 2518 | nf_conntrack_expect_fini(); |
77571149 FW |
2519 | |
2520 | kmem_cache_destroy(nf_conntrack_cachep); | |
08f6547d | 2521 | } |
9fb9cbb1 | 2522 | |
f94161c1 G |
2523 | /* |
2524 | * Mishearing the voices in his head, our hero wonders how he's | |
2525 | * supposed to kill the mall. | |
2526 | */ | |
2527 | void nf_conntrack_cleanup_net(struct net *net) | |
08f6547d | 2528 | { |
dece40e8 VD |
2529 | LIST_HEAD(single); |
2530 | ||
2531 | list_add(&net->exit_list, &single); | |
2532 | nf_conntrack_cleanup_net_list(&single); | |
2533 | } | |
2534 | ||
2535 | void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) | |
2536 | { | |
8169ff58 | 2537 | struct nf_ct_iter_data iter_data = {}; |
dece40e8 | 2538 | struct net *net; |
8169ff58 | 2539 | int busy; |
dece40e8 | 2540 | |
f94161c1 G |
2541 | /* |
2542 | * This makes sure all current packets have passed through | |
2543 | * the netfilter framework. Roll on, two-stage module |
2544 | * delete... | |
2545 | */ | |
2546 | synchronize_net(); | |
dece40e8 VD |
2547 | i_see_dead_people: |
2548 | busy = 0; | |
2549 | list_for_each_entry(net, net_exit_list, exit_list) { | |
0418b989 | 2550 | struct nf_conntrack_net *cnet = nf_ct_pernet(net); |
c53bd0e9 | 2551 | |
8169ff58 PNA |
2552 | iter_data.net = net; |
2553 | nf_ct_iterate_cleanup_net(kill_all, &iter_data); | |
c53bd0e9 | 2554 | if (atomic_read(&cnet->count) != 0) |
dece40e8 VD |
2555 | busy = 1; |
2556 | } | |
2557 | if (busy) { | |
9fb9cbb1 YK |
2558 | schedule(); |
2559 | goto i_see_dead_people; | |
2560 | } | |
2561 | ||
dece40e8 | 2562 | list_for_each_entry(net, net_exit_list, exit_list) { |
dece40e8 | 2563 | nf_conntrack_ecache_pernet_fini(net); |
dece40e8 | 2564 | nf_conntrack_expect_pernet_fini(net); |
dece40e8 VD |
2565 | free_percpu(net->ct.stat); |
2566 | } | |
08f6547d AD |
2567 | } |
2568 | ||
d862a662 | 2569 | void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) |
9fb9cbb1 | 2570 | { |
ea781f19 ED |
2571 | struct hlist_nulls_head *hash; |
2572 | unsigned int nr_slots, i; | |
9fb9cbb1 | 2573 | |
9cc1c73a FW |
2574 | if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) |
2575 | return NULL; | |
2576 | ||
ea781f19 ED |
2577 | BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); |
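/* Round the requested size up to a whole number of pages worth of
 * buckets; the rounded value is passed back to the caller via *sizep.
 */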
2578 | nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); | |
9cc1c73a | 2579 | |
b9e0102a | 2580 | hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); |
9fb9cbb1 | 2581 | |
ea781f19 ED |
2582 | if (hash && nulls) |
2583 | for (i = 0; i < nr_slots; i++) | |
2584 | INIT_HLIST_NULLS_HEAD(&hash[i], i); | |
9fb9cbb1 YK |
2585 | |
2586 | return hash; | |
2587 | } | |
ac565e5f | 2588 | EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); |
9fb9cbb1 | 2589 | |
3183ab89 | 2590 | int nf_conntrack_hash_resize(unsigned int hashsize) |
9fb9cbb1 | 2591 | { |
3183ab89 FW |
2592 | int i, bucket; |
2593 | unsigned int old_size; | |
ea781f19 | 2594 | struct hlist_nulls_head *hash, *old_hash; |
9fb9cbb1 | 2595 | struct nf_conntrack_tuple_hash *h; |
5d0aa2cc | 2596 | struct nf_conn *ct; |
9fb9cbb1 | 2597 | |
9fb9cbb1 YK |
2598 | if (!hashsize) |
2599 | return -EINVAL; | |
2600 | ||
d862a662 | 2601 | hash = nf_ct_alloc_hashtable(&hashsize, 1); |
9fb9cbb1 YK |
2602 | if (!hash) |
2603 | return -ENOMEM; | |
2604 | ||
e9edc188 | 2605 | mutex_lock(&nf_conntrack_mutex); |
3183ab89 FW |
2606 | old_size = nf_conntrack_htable_size; |
2607 | if (old_size == hashsize) { | |
e9edc188 | 2608 | mutex_unlock(&nf_conntrack_mutex); |
285189c7 | 2609 | kvfree(hash); |
3183ab89 FW |
2610 | return 0; |
2611 | } | |
2612 | ||
93bb0ceb JDB |
2613 | local_bh_disable(); |
2614 | nf_conntrack_all_lock(); | |
a3efd812 | 2615 | write_seqcount_begin(&nf_conntrack_generation); |
93bb0ceb | 2616 | |
76507f69 PM |
2617 | /* Lookups in the old hash might happen in parallel, which means we |
2618 | * might get false negatives during connection lookup. New connections | |
2619 | * created because of a false negative won't make it into the hash | |
93bb0ceb | 2620 | * though since that required taking the locks. |
76507f69 | 2621 | */ |
93bb0ceb | 2622 | |
56d52d48 FW |
2623 | for (i = 0; i < nf_conntrack_htable_size; i++) { |
2624 | while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { | |
b16ac3c4 FW |
2625 | unsigned int zone_id; |
2626 | ||
56d52d48 FW |
2627 | h = hlist_nulls_entry(nf_conntrack_hash[i].first, |
2628 | struct nf_conntrack_tuple_hash, hnnode); | |
5d0aa2cc | 2629 | ct = nf_ct_tuplehash_to_ctrack(h); |
ea781f19 | 2630 | hlist_nulls_del_rcu(&h->hnnode); |
b16ac3c4 FW |
2631 | |
2632 | zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); | |
1b8c8a9f | 2633 | bucket = __hash_conntrack(nf_ct_net(ct), |
b16ac3c4 | 2634 | &h->tuple, zone_id, hashsize); |
ea781f19 | 2635 | hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); |
9fb9cbb1 YK |
2636 | } |
2637 | } | |
56d52d48 | 2638 | old_hash = nf_conntrack_hash; |
9fb9cbb1 | 2639 | |
56d52d48 FW |
2640 | nf_conntrack_hash = hash; |
2641 | nf_conntrack_htable_size = hashsize; | |
93bb0ceb | 2642 | |
a3efd812 | 2643 | write_seqcount_end(&nf_conntrack_generation); |
93bb0ceb JDB |
2644 | nf_conntrack_all_unlock(); |
2645 | local_bh_enable(); | |
9fb9cbb1 | 2646 | |
e9edc188 ED |
2647 | mutex_unlock(&nf_conntrack_mutex); |
2648 | ||
5e3c61f9 | 2649 | synchronize_net(); |
285189c7 | 2650 | kvfree(old_hash); |
9fb9cbb1 YK |
2651 | return 0; |
2652 | } | |
3183ab89 | 2653 | |
e4dca7b7 | 2654 | int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) |
3183ab89 FW |
2655 | { |
2656 | unsigned int hashsize; | |
2657 | int rc; | |
2658 | ||
2659 | if (current->nsproxy->net_ns != &init_net) | |
2660 | return -EOPNOTSUPP; | |
2661 | ||
2662 | /* On boot, we can set this without any fancy locking. */ | |
2045cdfa | 2663 | if (!nf_conntrack_hash) |
3183ab89 FW |
2664 | return param_set_uint(val, kp); |
2665 | ||
2666 | rc = kstrtouint(val, 0, &hashsize); | |
2667 | if (rc) | |
2668 | return rc; | |
2669 | ||
2670 | return nf_conntrack_hash_resize(hashsize); | |
2671 | } | |
9fb9cbb1 | 2672 | |
f94161c1 | 2673 | int nf_conntrack_init_start(void) |
9fb9cbb1 | 2674 | { |
ca79b0c2 | 2675 | unsigned long nr_pages = totalram_pages(); |
f205c5e0 | 2676 | int max_factor = 8; |
0c5366b3 | 2677 | int ret = -ENOMEM; |
cc41c84b | 2678 | int i; |
93bb0ceb | 2679 | |
8201d923 AD |
2680 | seqcount_spinlock_init(&nf_conntrack_generation, |
2681 | &nf_conntrack_locks_all_lock); | |
a3efd812 | 2682 | |
d5d20912 | 2683 | for (i = 0; i < CONNTRACK_LOCKS; i++) |
93bb0ceb | 2684 | spin_lock_init(&nf_conntrack_locks[i]); |
9fb9cbb1 | 2685 | |
9fb9cbb1 YK |
2686 | if (!nf_conntrack_htable_size) { |
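/* Scale the default bucket count with available memory, then cap
 * it at 262144 (64-bit with more than 4 GB) or 65536 (more than
 * 1 GB), with a floor of 1024 buckets.
 */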
2687 | nf_conntrack_htable_size | |
3d6357de | 2688 | = (((nr_pages << PAGE_SHIFT) / 16384) |
f205c5e0 | 2689 | / sizeof(struct hlist_head)); |
d532bcd0 FW |
2690 | if (BITS_PER_LONG >= 64 && |
2691 | nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) | |
2692 | nf_conntrack_htable_size = 262144; | |
3d6357de | 2693 | else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) |
d532bcd0 FW |
2694 | nf_conntrack_htable_size = 65536; |
2695 | ||
2696 | if (nf_conntrack_htable_size < 1024) | |
2697 | nf_conntrack_htable_size = 1024; | |
2698 | /* Use a max. factor of one by default to keep the average | |
2699 | * hash chain length at 2 entries. Each entry has to be added | |
2700 | * twice (once for original direction, once for reply). | |
2701 | * When a table size is given we use the old value of 8 to | |
2702 | * avoid implicit reduction of the max entries setting. | |
2703 | */ | |
2704 | max_factor = 1; | |
9fb9cbb1 | 2705 | } |
56d52d48 FW |
2706 | |
2707 | nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); | |
2708 | if (!nf_conntrack_hash) | |
2709 | return -ENOMEM; | |
2710 | ||
f205c5e0 | 2711 | nf_conntrack_max = max_factor * nf_conntrack_htable_size; |
8e5105a0 | 2712 | |
0c5366b3 | 2713 | nf_conntrack_cachep = kmem_cache_create("nf_conntrack", |
a9e419dc FW |
2714 | sizeof(struct nf_conn), |
2715 | NFCT_INFOMASK + 1, | |
5f0d5a3a | 2716 | SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); |
0c5366b3 FW |
2717 | if (!nf_conntrack_cachep) |
2718 | goto err_cachep; | |
2719 | ||
83b4dbe1 G |
2720 | ret = nf_conntrack_expect_init(); |
2721 | if (ret < 0) | |
2722 | goto err_expect; | |
2723 | ||
5e615b22 G |
2724 | ret = nf_conntrack_helper_init(); |
2725 | if (ret < 0) | |
2726 | goto err_helper; | |
2727 | ||
04d87001 G |
2728 | ret = nf_conntrack_proto_init(); |
2729 | if (ret < 0) | |
2730 | goto err_proto; | |
2731 | ||
b87a2f91 | 2732 | conntrack_gc_work_init(&conntrack_gc_work); |
0984d427 | 2733 | queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); |
b87a2f91 | 2734 | |
b4c2b959 KKD |
2735 | ret = register_nf_conntrack_bpf(); |
2736 | if (ret < 0) | |
2737 | goto err_kfunc; | |
2738 | ||
08f6547d AD |
2739 | return 0; |
2740 | ||
b4c2b959 KKD |
2741 | err_kfunc: |
2742 | cancel_delayed_work_sync(&conntrack_gc_work.dwork); | |
2743 | nf_conntrack_proto_fini(); | |
04d87001 | 2744 | err_proto: |
5f69b8f5 | 2745 | nf_conntrack_helper_fini(); |
5e615b22 | 2746 | err_helper: |
b7ff3a1f | 2747 | nf_conntrack_expect_fini(); |
83b4dbe1 | 2748 | err_expect: |
0c5366b3 FW |
2749 | kmem_cache_destroy(nf_conntrack_cachep); |
2750 | err_cachep: | |
285189c7 | 2751 | kvfree(nf_conntrack_hash); |
08f6547d AD |
2752 | return ret; |
2753 | } | |
2754 | ||
2954fe60 FW |
2755 | static void nf_conntrack_set_closing(struct nf_conntrack *nfct) |
2756 | { | |
2757 | struct nf_conn *ct = nf_ct_to_nf_conn(nfct); | |
2758 | ||
2759 | switch (nf_ct_protonum(ct)) { | |
2760 | case IPPROTO_TCP: | |
2761 | nf_conntrack_tcp_set_closing(ct); | |
2762 | break; | |
2763 | } | |
2764 | } | |
2765 | ||
285c8a7a | 2766 | static const struct nf_ct_hook nf_conntrack_hook = { |
368982cd | 2767 | .update = nf_conntrack_update, |
6ae7989c | 2768 | .destroy = nf_ct_destroy, |
b60a6040 | 2769 | .get_tuple_skb = nf_conntrack_get_tuple_skb, |
3fce1649 | 2770 | .attach = nf_conntrack_attach, |
2954fe60 | 2771 | .set_closing = nf_conntrack_set_closing, |
1f4b2439 PNA |
2772 | }; |
2773 | ||
f94161c1 G |
2774 | void nf_conntrack_init_end(void) |
2775 | { | |
1f4b2439 | 2776 | RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); |
f94161c1 G |
2777 | } |
2778 | ||
8cc20198 ED |
2779 | /* |
2780 | * We need to use special "null" values, not used in hash table | |
2781 | */ | |
2782 | #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) | |
8cc20198 | 2783 | |
f94161c1 | 2784 | int nf_conntrack_init_net(struct net *net) |
08f6547d | 2785 | { |
0418b989 | 2786 | struct nf_conntrack_net *cnet = nf_ct_pernet(net); |
b7779d06 | 2787 | int ret = -ENOMEM; |
ceceae1b | 2788 | |
cc41c84b | 2789 | BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); |
2e7b162c | 2790 | BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); |
c53bd0e9 | 2791 | atomic_set(&cnet->count, 0); |
b7779d06 | 2792 | |
b7779d06 JDB |
2793 | net->ct.stat = alloc_percpu(struct ip_conntrack_stat); |
2794 | if (!net->ct.stat) | |
8a75a2c1 | 2795 | return ret; |
b7779d06 | 2796 | |
83b4dbe1 | 2797 | ret = nf_conntrack_expect_pernet_init(net); |
08f6547d AD |
2798 | if (ret < 0) |
2799 | goto err_expect; | |
fc3893fd FW |
2800 | |
2801 | nf_conntrack_acct_pernet_init(net); | |
2802 | nf_conntrack_tstamp_pernet_init(net); | |
2803 | nf_conntrack_ecache_pernet_init(net); | |
4a60dc74 | 2804 | nf_conntrack_proto_pernet_init(net); |
fc3893fd | 2805 | |
08f6547d | 2806 | return 0; |
c539f017 | 2807 | |
08f6547d | 2808 | err_expect: |
0d55af87 | 2809 | free_percpu(net->ct.stat); |
08f6547d AD |
2810 | return ret; |
2811 | } | |
0b389236 | 2812 | |
0b389236 KKD |
2813 | /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ |
2814 | ||
2815 | int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout) | |
2816 | { | |
2817 | if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) | |
2818 | return -EPERM; | |
2819 | ||
2820 | __nf_ct_set_timeout(ct, timeout); | |
2821 | ||
2822 | if (test_bit(IPS_DYING_BIT, &ct->status)) | |
2823 | return -ETIME; | |
2824 | ||
2825 | return 0; | |
2826 | } | |
2827 | EXPORT_SYMBOL_GPL(__nf_ct_change_timeout); | |
2828 | ||
ef69aa3a LB |
2829 | void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off) |
2830 | { | |
2831 | unsigned int bit; | |
2832 | ||
2833 | /* Ignore these unchangeable bits */ |
2834 | on &= ~IPS_UNCHANGEABLE_MASK; | |
2835 | off &= ~IPS_UNCHANGEABLE_MASK; | |
2836 | ||
2837 | for (bit = 0; bit < __IPS_MAX_BIT; bit++) { | |
2838 | if (on & (1 << bit)) | |
2839 | set_bit(bit, &ct->status); | |
2840 | else if (off & (1 << bit)) | |
2841 | clear_bit(bit, &ct->status); | |
2842 | } | |
2843 | } | |
2844 | EXPORT_SYMBOL_GPL(__nf_ct_change_status); | |
2845 | ||
2846 | int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status) | |
2847 | { | |
2848 | unsigned long d; | |
2849 | ||
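/* d holds the status bits that would change */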
2850 | d = ct->status ^ status; | |
2851 | ||
2852 | if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) | |
2853 | /* unchangeable */ | |
2854 | return -EBUSY; | |
2855 | ||
2856 | if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) | |
2857 | /* SEEN_REPLY bit can only be set */ | |
2858 | return -EBUSY; | |
2859 | ||
2860 | if (d & IPS_ASSURED && !(status & IPS_ASSURED)) | |
2861 | /* ASSURED bit can only be set */ | |
2862 | return -EBUSY; | |
2863 | ||
2864 | __nf_ct_change_status(ct, status, 0); | |
2865 | return 0; | |
2866 | } | |
2867 | EXPORT_SYMBOL_GPL(nf_ct_change_status_common); |