// SPDX-License-Identifier: GPL-2.0-only
/*
 * (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2011 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/xfrm.h>
#include <linux/siphash.h>
#include <linux/rtnetlink.h>

#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <uapi/linux/netfilter/nf_nat.h>

#include "nf_internals.h"

#define NF_NAT_MAX_ATTEMPTS	128
#define NF_NAT_HARDER_THRESH	(NF_NAT_MAX_ATTEMPTS / 4)
static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];

static DEFINE_MUTEX(nf_nat_proto_mutex);
static unsigned int nat_net_id __read_mostly;

static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
static siphash_aligned_key_t nf_nat_hash_rnd;

struct nf_nat_lookup_hook_priv {
	struct nf_hook_entries __rcu *entries;

	struct rcu_head rcu_head;
};

struct nf_nat_hooks_net {
	struct nf_hook_ops *nat_hook_ops;
	unsigned int users;
};

struct nat_net {
	struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
};
#ifdef CONFIG_XFRM
static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
				       const struct nf_conn *ct,
				       enum ip_conntrack_dir dir,
				       unsigned long statusbit,
				       struct flowi *fl)
{
	const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
	struct flowi4 *fl4 = &fl->u.ip4;

	if (ct->status & statusbit) {
		fl4->daddr = t->dst.u3.ip;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_DCCP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl4->fl4_dport = t->dst.u.all;
	}

	statusbit ^= IPS_NAT_MASK;

	if (ct->status & statusbit) {
		fl4->saddr = t->src.u3.ip;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_DCCP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl4->fl4_sport = t->src.u.all;
	}
}

static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
				       const struct nf_conn *ct,
				       enum ip_conntrack_dir dir,
				       unsigned long statusbit,
				       struct flowi *fl)
{
#if IS_ENABLED(CONFIG_IPV6)
	const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
	struct flowi6 *fl6 = &fl->u.ip6;

	if (ct->status & statusbit) {
		fl6->daddr = t->dst.u3.in6;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_DCCP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl6->fl6_dport = t->dst.u.all;
	}

	statusbit ^= IPS_NAT_MASK;

	if (ct->status & statusbit) {
		fl6->saddr = t->src.u3.in6;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_DCCP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl6->fl6_sport = t->src.u.all;
	}
#endif
}

static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
{
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
	unsigned long statusbit;
	u8 family;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return;

	family = nf_ct_l3num(ct);
	dir = CTINFO2DIR(ctinfo);
	if (dir == IP_CT_DIR_ORIGINAL)
		statusbit = IPS_DST_NAT;
	else
		statusbit = IPS_SRC_NAT;

	switch (family) {
	case NFPROTO_IPV4:
		nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl);
		return;
	case NFPROTO_IPV6:
		nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl);
		return;
	}
}
#endif /* CONFIG_XFRM */

/* We keep an extra hash for each conntrack, for fast searching. */
static unsigned int
hash_by_src(const struct net *net,
	    const struct nf_conntrack_zone *zone,
	    const struct nf_conntrack_tuple *tuple)
{
	unsigned int hash;
	struct {
		struct nf_conntrack_man src;
		u32 net_mix;
		u32 protonum;
		u32 zone;
	} __aligned(SIPHASH_ALIGNMENT) combined;

	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));

	memset(&combined, 0, sizeof(combined));

	/* Original src, to ensure we map it consistently if possible. */
	combined.src = tuple->src;
	combined.net_mix = net_hash_mix(net);
	combined.protonum = tuple->dst.protonum;

	/* Zone ID can be used provided it's valid for both directions. */
	if (zone->dir == NF_CT_DEFAULT_ZONE_DIR)
		combined.zone = zone->id;

	hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd);

	return reciprocal_scale(hash, nf_nat_htable_size);
}
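/* Illustration: reciprocal_scale() maps the 32-bit siphash result h
 * into [0, nf_nat_htable_size) as (u32)(((u64)h * size) >> 32), a
 * multiply-shift instead of a modulo, so the table size does not have
 * to be a power of two.
 */
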
/* Is this tuple already taken? (not by us) */
static int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
		  const struct nf_conn *ignored_conntrack)
{
	/* Conntrack tracking doesn't keep track of outgoing tuples; only
	 * incoming ones.  NAT means they don't have a fixed mapping,
	 * so we invert the tuple and look for the incoming reply.
	 *
	 * We could keep a separate hash if this proves too slow.
	 */
	struct nf_conntrack_tuple reply;

	nf_ct_invert_tuple(&reply, tuple);
	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}

static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
{
	static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
						  IPS_DYING;
	static const unsigned long flags_needed = IPS_SRC_NAT;
	enum tcp_conntrack old_state;

	old_state = READ_ONCE(ct->proto.tcp.state);
	if (old_state < TCP_CONNTRACK_TIME_WAIT)
		return false;

	if (flags & flags_refuse)
		return false;

	return (flags & flags_needed) == flags_needed;
}

/* reverse direction will send packets to new source, so
 * make sure such packets are invalid.
 */
static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
{
	return (__s32)(new->proto.tcp.seen[0].td_end -
		       old->proto.tcp.seen[0].td_end) > 0;
}

static int
nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack,
			 unsigned int attempts_left)
{
	static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
	struct nf_conntrack_tuple_hash *thash;
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple reply;
	unsigned long flags;
	struct nf_conn *ct;
	bool taken = true;
	struct net *net;

	nf_ct_invert_tuple(&reply, tuple);

	if (attempts_left > NF_NAT_HARDER_THRESH ||
	    tuple->dst.protonum != IPPROTO_TCP ||
	    ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
		return nf_conntrack_tuple_taken(&reply, ignored_conntrack);

	/* Last few attempts to find a free tcp port. Destructive
	 * action: evict a colliding entry if it's in timewait state and
	 * the tcp sequence number has advanced past the one used by the
	 * old entry.
	 */
	net = nf_ct_net(ignored_conntrack);
	zone = nf_ct_zone(ignored_conntrack);

	thash = nf_conntrack_find_get(net, zone, &reply);
	if (!thash)
		return false;

	ct = nf_ct_tuplehash_to_ctrack(thash);

	if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
		goto out;

	if (WARN_ON_ONCE(ct == ignored_conntrack))
		goto out;

	flags = READ_ONCE(ct->status);
	if (!nf_nat_may_kill(ct, flags))
		goto out;

	if (!nf_seq_has_advanced(ct, ignored_conntrack))
		goto out;

	/* Even if we can evict do not reuse if entry is offloaded. */
	if (nf_ct_kill(ct))
		taken = flags & flags_offload;
out:
	nf_ct_put(ct);
	return taken;
}
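/* With the defaults above, NF_NAT_HARDER_THRESH is 32: the destructive
 * timewait eviction in nf_nat_used_tuple_harder() is only considered
 * during the last 32 of the (at most) 128 port search attempts, and only
 * for TCP connections that are still in SYN_SENT.
 */
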
static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
				 const struct nf_nat_range2 *range)
{
	if (t->src.l3num == NFPROTO_IPV4)
		return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
		       ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);

	return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
	       ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
}

/* Is the manipulable part of the tuple between min and max inclusive? */
static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
			     enum nf_nat_manip_type maniptype,
			     const union nf_conntrack_man_proto *min,
			     const union nf_conntrack_man_proto *max)
{
	__be16 port;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
		       ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
	case IPPROTO_GRE: /* all fall through */
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
	case IPPROTO_DCCP:
	case IPPROTO_SCTP:
		if (maniptype == NF_NAT_MANIP_SRC)
			port = tuple->src.u.all;
		else
			port = tuple->dst.u.all;

		return ntohs(port) >= ntohs(min->all) &&
		       ntohs(port) <= ntohs(max->all);
	default:
		return true;
	}
}
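/* Example: for a TCP tuple with source port 1500 under NF_NAT_MANIP_SRC,
 * a range with min->all = htons(1000) and max->all = htons(2000) is a
 * match, since the comparison is done on the host-order values:
 * 1000 <= 1500 <= 2000.
 */
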
/* If we source-map this tuple so the reply looks like reply_tuple, will
 * that meet the constraints of range?
 */
static int nf_in_range(const struct nf_conntrack_tuple *tuple,
		       const struct nf_nat_range2 *range)
{
	/* If we are supposed to map IPs, then we must be in the
	 * range specified, otherwise let this drag us onto a new src IP.
	 */
	if (range->flags & NF_NAT_RANGE_MAP_IPS &&
	    !nf_nat_inet_in_range(tuple, range))
		return 0;

	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
		return 1;

	return l4proto_in_range(tuple, NF_NAT_MANIP_SRC,
				&range->min_proto, &range->max_proto);
}
static inline int
same_src(const struct nf_conn *ct,
	 const struct nf_conntrack_tuple *tuple)
{
	const struct nf_conntrack_tuple *t;

	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
	return (t->dst.protonum == tuple->dst.protonum &&
		nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) &&
		t->src.u.all == tuple->src.u.all);
}

/* Only called for SRC manip */
static int
find_appropriate_src(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *tuple,
		     struct nf_conntrack_tuple *result,
		     const struct nf_nat_range2 *range)
{
	unsigned int h = hash_by_src(net, zone, tuple);
	const struct nf_conn *ct;

	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
		if (same_src(ct, tuple) &&
		    net_eq(net, nf_ct_net(ct)) &&
		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
			/* Copy source part from reply tuple. */
			nf_ct_invert_tuple(result,
				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
			result->dst = tuple->dst;

			if (nf_in_range(result, range))
				return 1;
		}
	}
	return 0;
}
/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
		    struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range2 *range,
		    const struct nf_conn *ct,
		    enum nf_nat_manip_type maniptype)
{
	union nf_inet_addr *var_ipp;
	unsigned int i, max;
	/* Host order */
	u32 minip, maxip, j, dist;
	bool full_range;

	/* No IP mapping?  Do nothing. */
	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
		return;

	if (maniptype == NF_NAT_MANIP_SRC)
		var_ipp = &tuple->src.u3;
	else
		var_ipp = &tuple->dst.u3;

	/* Fast path: only one choice. */
	if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
		*var_ipp = range->min_addr;
		return;
	}

	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
		max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
	else
		max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

	/* Hashing source and destination IPs gives a fairly even
	 * spread in practice (if there are a small number of IPs
	 * involved, there usually aren't that many connections
	 * anyway).  The consistency means that servers see the same
	 * client coming from the same IP (some Internet Banking sites
	 * like this), even across reboots.
	 */
	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
		   range->flags & NF_NAT_RANGE_PERSISTENT ?
			0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);

	full_range = false;
	for (i = 0; i <= max; i++) {
		/* If first bytes of the address are at the maximum, use the
		 * distance. Otherwise use the full range.
		 */
		if (!full_range) {
			minip = ntohl((__force __be32)range->min_addr.all[i]);
			maxip = ntohl((__force __be32)range->max_addr.all[i]);
			dist  = maxip - minip + 1;
		} else {
			minip = 0;
			dist  = ~0;
		}

		var_ipp->all[i] = (__force __u32)
			htonl(minip + reciprocal_scale(j, dist));
		if (var_ipp->all[i] != range->max_addr.all[i])
			full_range = true;

		if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
			j ^= (__force u32)tuple->dst.u3.all[i];
	}
}
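/* Example: an IPv4 SNAT range of 10.0.0.1 - 10.0.0.14 yields
 * minip = 0x0a000001, maxip = 0x0a00000e, dist = 14, and the chosen
 * address is htonl(minip + reciprocal_scale(j, 14)).  Because j is
 * derived from the source address (and, without NF_NAT_RANGE_PERSISTENT,
 * the destination address and zone), the same client keeps mapping to
 * the same address, as the comment above describes.
 */
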
/* Alter the per-proto part of the tuple (depending on maniptype), to
 * give a unique tuple in the given range if possible.
 *
 * Per-protocol part of tuple is initialized to the incoming packet.
 */
static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
					const struct nf_nat_range2 *range,
					enum nf_nat_manip_type maniptype,
					const struct nf_conn *ct)
{
	unsigned int range_size, min, max, i, attempts;
	__be16 *keyptr;
	u16 off;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		/* id is same for either direction... */
		keyptr = &tuple->src.u.icmp.id;
		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 0;
			range_size = 65536;
		} else {
			min = ntohs(range->min_proto.icmp.id);
			range_size = ntohs(range->max_proto.icmp.id) -
				     ntohs(range->min_proto.icmp.id) + 1;
		}
		goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
	case IPPROTO_GRE:
		/* If there is no master conntrack we are not PPTP;
		 * do not change tuples.
		 */
		if (!ct->master)
			return;

		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.gre.key;
		else
			keyptr = &tuple->dst.u.gre.key;

		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 1;
			range_size = 65535;
		} else {
			min = ntohs(range->min_proto.gre.key);
			range_size = ntohs(range->max_proto.gre.key) - min + 1;
		}
		goto find_free_id;
#endif
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
	case IPPROTO_TCP:
	case IPPROTO_SCTP:
	case IPPROTO_DCCP:
		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.all;
		else
			keyptr = &tuple->dst.u.all;

		break;
	default:
		return;
	}

	/* If no range specified... */
	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
		/* If it's dst rewrite, can't change port */
		if (maniptype == NF_NAT_MANIP_DST)
			return;

		if (ntohs(*keyptr) < 1024) {
			/* Loose convention: >> 512 is credential passing */
			if (ntohs(*keyptr) < 512) {
				min = 1;
				range_size = 511 - min + 1;
			} else {
				min = 600;
				range_size = 1023 - min + 1;
			}
		} else {
			min = 1024;
			range_size = 65535 - 1024 + 1;
		}
	} else {
		min = ntohs(range->min_proto.all);
		max = ntohs(range->max_proto.all);
		if (unlikely(max < min))
			swap(max, min);
		range_size = max - min + 1;
	}

find_free_id:
	if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
		off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
	else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) ||
		 maniptype != NF_NAT_MANIP_DST)
		off = get_random_u16();
	else
		off = 0;

	attempts = range_size;
	if (attempts > NF_NAT_MAX_ATTEMPTS)
		attempts = NF_NAT_MAX_ATTEMPTS;

	/* We are in softirq; doing a search of the entire range risks
	 * soft lockup when all tuples are already used.
	 *
	 * If we can't find any free port from first offset, pick a new
	 * one and try again, with ever smaller search window.
	 */
another_round:
	for (i = 0; i < attempts; i++, off++) {
		*keyptr = htons(min + off % range_size);
		if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
			return;
	}

	if (attempts >= range_size || attempts < 16)
		return;
	attempts /= 2;
	off = get_random_u16();
	goto another_round;
}
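/* Example: with a specified range of ports 10000-10005, range_size is 6
 * and attempts is also 6 (below NF_NAT_MAX_ATTEMPTS).  Starting from a
 * random 16-bit offset off, the candidates tried are
 * htons(10000 + off % 6), htons(10000 + (off + 1) % 6), ... until
 * nf_nat_used_tuple_harder() reports one as free; since attempts equals
 * range_size here, the search gives up after a single pass.
 */
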
/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __nf_conntrack_confirm and drop the packet.
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
		 const struct nf_conntrack_tuple *orig_tuple,
		 const struct nf_nat_range2 *range,
		 struct nf_conn *ct,
		 enum nf_nat_manip_type maniptype)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);

	zone = nf_ct_zone(ct);

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	 * and that same mapping gives a unique tuple within the given
	 * range, use that.
	 *
	 * This is only required for source (i.e. NAT/masq) mappings.
	 * So far, we don't do local source mappings, so multiple
	 * manips are not an issue.
	 */
	if (maniptype == NF_NAT_MANIP_SRC &&
	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		/* try the original tuple first */
		if (nf_in_range(orig_tuple, range)) {
			if (!nf_nat_used_tuple(orig_tuple, ct)) {
				*tuple = *orig_tuple;
				return;
			}
		} else if (find_appropriate_src(net, zone,
						orig_tuple, tuple, range)) {
			pr_debug("get_unique_tuple: Found current src map\n");
			if (!nf_nat_used_tuple(tuple, ct))
				return;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given range */
	*tuple = *orig_tuple;
	find_best_ips_proto(zone, tuple, range, ct, maniptype);

	/* 3) The per-protocol part of the manip is made to map into
	 * the range to make a unique tuple.
	 */

	/* Only bother mapping if it's not already in range and unique */
	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
			if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
			    l4proto_in_range(tuple, maniptype,
					     &range->min_proto,
					     &range->max_proto) &&
			    (range->min_proto.all == range->max_proto.all ||
			     !nf_nat_used_tuple(tuple, ct)))
				return;
		} else if (!nf_nat_used_tuple(tuple, ct)) {
			return;
		}
	}

	/* Last chance: get protocol to try to obtain unique tuple. */
	nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}
struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
{
	struct nf_conn_nat *nat = nfct_nat(ct);

	if (nat)
		return nat;

	if (!nf_ct_is_confirmed(ct))
		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);

	return nat;
}
EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);

unsigned int
nf_nat_setup_info(struct nf_conn *ct,
		  const struct nf_nat_range2 *range,
		  enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_tuple curr_tuple, new_tuple;

	/* Can't set up NAT info for a confirmed ct. */
	if (nf_ct_is_confirmed(ct))
		return NF_ACCEPT;

	WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
		maniptype != NF_NAT_MANIP_DST);

	if (WARN_ON(nf_nat_initialized(ct, maniptype)))
		return NF_DROP;

	/* What we've got will look like the inverse of the reply. Normally
	 * this is what is in the conntrack, except for prior
	 * manipulations (future optimization: if num_manips == 0,
	 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
	 */
	nf_ct_invert_tuple(&curr_tuple,
			   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
		struct nf_conntrack_tuple reply;

		/* Alter conntrack table so it will recognize replies. */
		nf_ct_invert_tuple(&reply, &new_tuple);
		nf_conntrack_alter_reply(ct, &reply);

		/* Non-atomic: we own this at the moment. */
		if (maniptype == NF_NAT_MANIP_SRC)
			ct->status |= IPS_SRC_NAT;
		else
			ct->status |= IPS_DST_NAT;

		if (nfct_help(ct) && !nfct_seqadj(ct))
			if (!nfct_seqadj_ext_add(ct))
				return NF_DROP;
	}

	if (maniptype == NF_NAT_MANIP_SRC) {
		unsigned int srchash;
		spinlock_t *lock;

		srchash = hash_by_src(net, nf_ct_zone(ct),
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
		spin_lock_bh(lock);
		hlist_add_head_rcu(&ct->nat_bysource,
				   &nf_nat_bysource[srchash]);
		spin_unlock_bh(lock);
	}

	/* It's done. */
	if (maniptype == NF_NAT_MANIP_DST)
		ct->status |= IPS_DST_NAT_DONE;
	else
		ct->status |= IPS_SRC_NAT_DONE;

	return NF_ACCEPT;
}
EXPORT_SYMBOL(nf_nat_setup_info);
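/* Usage sketch (hypothetical caller): a rule that source-NATs to the
 * single address 192.0.2.1 could set up the binding on an unconfirmed
 * conntrack roughly as
 *
 *	struct nf_nat_range2 range = {
 *		.flags		= NF_NAT_RANGE_MAP_IPS,
 *		.min_addr.ip	= htonl(0xc0000201),
 *		.max_addr.ip	= htonl(0xc0000201),
 *	};
 *	return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
 *
 * mirroring what __nf_nat_alloc_null_binding() below does with the
 * existing reply address.
 */
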
static unsigned int
__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
{
	/* Force range to this IP; let proto decide mapping for
	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
	 * Use reply in case it's already been mangled (eg local packet).
	 */
	union nf_inet_addr ip =
		(manip == NF_NAT_MANIP_SRC ?
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
	struct nf_nat_range2 range = {
		.flags		= NF_NAT_RANGE_MAP_IPS,
		.min_addr	= ip,
		.max_addr	= ip,
	};
	return nf_nat_setup_info(ct, &range, manip);
}

unsigned int
nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
	return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);

/* Do packet manipulations according to nf_nat_setup_info. */
unsigned int nf_nat_packet(struct nf_conn *ct,
			   enum ip_conntrack_info ctinfo,
			   unsigned int hooknum,
			   struct sk_buff *skb)
{
	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	unsigned int verdict = NF_ACCEPT;
	unsigned long statusbit;

	if (mtype == NF_NAT_MANIP_SRC)
		statusbit = IPS_SRC_NAT;
	else
		statusbit = IPS_DST_NAT;

	/* Invert if this is reply dir. */
	if (dir == IP_CT_DIR_REPLY)
		statusbit ^= IPS_NAT_MASK;

	/* Non-atomic: these bits don't change. */
	if (ct->status & statusbit)
		verdict = nf_nat_manip_pkt(skb, ct, mtype, dir);

	return verdict;
}
EXPORT_SYMBOL_GPL(nf_nat_packet);
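/* The XOR with IPS_NAT_MASK lets one status flag drive both directions:
 * for an IPS_SRC_NAT flow, the SRC-manip hook matches packets in the
 * original direction and rewrites their source, while the DST-manip hook
 * sees reply packets with statusbit flipped back to IPS_SRC_NAT and
 * rewrites their destination back to the original source address.
 */
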
static bool in_vrf_postrouting(const struct nf_hook_state *state)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
	if (state->hook == NF_INET_POST_ROUTING &&
	    netif_is_l3_master(state->out))
		return true;
#endif
	return false;
}

unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
	       const struct nf_hook_state *state)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conn_nat *nat;
	/* maniptype == SRC for postrouting. */
	enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);

	ct = nf_ct_get(skb, &ctinfo);
	/* Can't track?  It's not due to stress, or conntrack would
	 * have dropped it.  Hence it's the user's responsibility to
	 * packet filter it out, or implement conntrack/NAT for that
	 * protocol. 8) --RR
	 */
	if (!ct || in_vrf_postrouting(state))
		return NF_ACCEPT;

	nat = nfct_nat(ct);

	switch (ctinfo) {
	case IP_CT_RELATED:
	case IP_CT_RELATED_REPLY:
		/* Only ICMPs can be IP_CT_IS_REPLY.  Fallthrough */
	case IP_CT_NEW:
		/* Seen it before?  This can happen for loopback, retrans,
		 * or local packets.
		 */
		if (!nf_nat_initialized(ct, maniptype)) {
			struct nf_nat_lookup_hook_priv *lpriv = priv;
			struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
			unsigned int ret;
			int i;

			if (!e)
				goto null_bind;

			for (i = 0; i < e->num_hook_entries; i++) {
				ret = e->hooks[i].hook(e->hooks[i].priv, skb,
						       state);
				if (ret != NF_ACCEPT)
					return ret;
				if (nf_nat_initialized(ct, maniptype))
					goto do_nat;
			}
null_bind:
			ret = nf_nat_alloc_null_binding(ct, state->hook);
			if (ret != NF_ACCEPT)
				return ret;
		} else {
			pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
				 ct, ct->status);
			if (nf_nat_oif_changed(state->hook, ctinfo, nat,
					       state->out))
				goto oif_changed;
		}
		break;
	default:
		/* ESTABLISHED */
		WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
			ctinfo != IP_CT_ESTABLISHED_REPLY);
		if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
			goto oif_changed;
	}
do_nat:
	return nf_nat_packet(ct, ctinfo, state->hook, skb);

oif_changed:
	nf_ct_kill_acct(ct, ctinfo, skb);
	return NF_DROP;
}
EXPORT_SYMBOL_GPL(nf_nat_inet_fn);

struct nf_nat_proto_clean {
	u8	l3proto;
	u8	l4proto;
};

/* kill conntracks with affected NAT section */
static int nf_nat_proto_remove(struct nf_conn *i, void *data)
{
	const struct nf_nat_proto_clean *clean = data;

	if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
	    (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
		return 0;

	return i->status & IPS_NAT_MASK ? 1 : 0;
}

static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
	unsigned int h;

	h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
	hlist_del_rcu(&ct->nat_bysource);
	spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
}

static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
{
	if (nf_nat_proto_remove(ct, data))
		return 1;

	/* This module is being removed and the conntrack has a NAT null
	 * binding.  Remove it from the bysource hash, as the table will
	 * be freed soon.
	 *
	 * Else, when the conntrack is destroyed, nf_nat_cleanup_conntrack()
	 * would delete the entry from an already-freed table.
	 */
	if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
		nf_nat_cleanup_conntrack(ct);

	/* don't delete conntrack.  Although that would make things a lot
	 * simpler, we'd end up flushing all conntracks on nat rmmod.
	 */
	return 0;
}
24de3d37 | 921 | #if IS_ENABLED(CONFIG_NF_CT_NETLINK) |
e6a7d3c0 PNA |
922 | |
923 | #include <linux/netfilter/nfnetlink.h> | |
924 | #include <linux/netfilter/nfnetlink_conntrack.h> | |
925 | ||
926 | static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { | |
927 | [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, | |
928 | [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, | |
929 | }; | |
930 | ||
76b90019 FW |
931 | static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], |
932 | struct nf_nat_range2 *range) | |
933 | { | |
934 | if (tb[CTA_PROTONAT_PORT_MIN]) { | |
935 | range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); | |
936 | range->max_proto.all = range->min_proto.all; | |
937 | range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; | |
938 | } | |
939 | if (tb[CTA_PROTONAT_PORT_MAX]) { | |
940 | range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); | |
941 | range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; | |
942 | } | |
943 | return 0; | |
944 | } | |
945 | ||
e6a7d3c0 PNA |
946 | static int nfnetlink_parse_nat_proto(struct nlattr *attr, |
947 | const struct nf_conn *ct, | |
2eb0f624 | 948 | struct nf_nat_range2 *range) |
e6a7d3c0 PNA |
949 | { |
950 | struct nlattr *tb[CTA_PROTONAT_MAX+1]; | |
e6a7d3c0 PNA |
951 | int err; |
952 | ||
8cb08174 JB |
953 | err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr, |
954 | protonat_nla_policy, NULL); | |
e6a7d3c0 PNA |
955 | if (err < 0) |
956 | return err; | |
957 | ||
76b90019 | 958 | return nf_nat_l4proto_nlattr_to_range(tb, range); |
e6a7d3c0 PNA |
959 | } |
960 | ||
961 | static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { | |
c7232c99 PM |
962 | [CTA_NAT_V4_MINIP] = { .type = NLA_U32 }, |
963 | [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 }, | |
58a317f1 PM |
964 | [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) }, |
965 | [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) }, | |
329fb58a | 966 | [CTA_NAT_PROTO] = { .type = NLA_NESTED }, |
e6a7d3c0 PNA |
967 | }; |
968 | ||
096d0906 FW |
969 | static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], |
970 | struct nf_nat_range2 *range) | |
971 | { | |
972 | if (tb[CTA_NAT_V4_MINIP]) { | |
973 | range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); | |
974 | range->flags |= NF_NAT_RANGE_MAP_IPS; | |
975 | } | |
976 | ||
977 | if (tb[CTA_NAT_V4_MAXIP]) | |
978 | range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); | |
979 | else | |
980 | range->max_addr.ip = range->min_addr.ip; | |
981 | ||
982 | return 0; | |
983 | } | |
984 | ||
985 | static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], | |
986 | struct nf_nat_range2 *range) | |
987 | { | |
988 | if (tb[CTA_NAT_V6_MINIP]) { | |
989 | nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], | |
990 | sizeof(struct in6_addr)); | |
991 | range->flags |= NF_NAT_RANGE_MAP_IPS; | |
992 | } | |
993 | ||
994 | if (tb[CTA_NAT_V6_MAXIP]) | |
995 | nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], | |
996 | sizeof(struct in6_addr)); | |
997 | else | |
998 | range->max_addr = range->min_addr; | |
999 | ||
1000 | return 0; | |
1001 | } | |
1002 | ||
e6a7d3c0 | 1003 | static int |
39938324 | 1004 | nfnetlink_parse_nat(const struct nlattr *nat, |
096d0906 | 1005 | const struct nf_conn *ct, struct nf_nat_range2 *range) |
e6a7d3c0 PNA |
1006 | { |
1007 | struct nlattr *tb[CTA_NAT_MAX+1]; | |
1008 | int err; | |
1009 | ||
1010 | memset(range, 0, sizeof(*range)); | |
1011 | ||
8cb08174 JB |
1012 | err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat, |
1013 | nat_nla_policy, NULL); | |
e6a7d3c0 PNA |
1014 | if (err < 0) |
1015 | return err; | |
1016 | ||
096d0906 FW |
1017 | switch (nf_ct_l3num(ct)) { |
1018 | case NFPROTO_IPV4: | |
1019 | err = nf_nat_ipv4_nlattr_to_range(tb, range); | |
1020 | break; | |
1021 | case NFPROTO_IPV6: | |
1022 | err = nf_nat_ipv6_nlattr_to_range(tb, range); | |
1023 | break; | |
1024 | default: | |
1025 | err = -EPROTONOSUPPORT; | |
1026 | break; | |
1027 | } | |
1028 | ||
1029 | if (err) | |
0eba801b | 1030 | return err; |
e6a7d3c0 PNA |
1031 | |
1032 | if (!tb[CTA_NAT_PROTO]) | |
0eba801b | 1033 | return 0; |
e6a7d3c0 | 1034 | |
0eba801b | 1035 | return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); |
e6a7d3c0 PNA |
1036 | } |
1037 | ||
0eba801b | 1038 | /* This function is called under rcu_read_lock() */ |
e6a7d3c0 PNA |
1039 | static int |
1040 | nfnetlink_parse_nat_setup(struct nf_conn *ct, | |
1041 | enum nf_nat_manip_type manip, | |
39938324 | 1042 | const struct nlattr *attr) |
e6a7d3c0 | 1043 | { |
2eb0f624 | 1044 | struct nf_nat_range2 range; |
c7232c99 | 1045 | int err; |
e6a7d3c0 | 1046 | |
0eba801b PNA |
1047 | /* Should not happen, restricted to creating new conntracks |
1048 | * via ctnetlink. | |
1049 | */ | |
1050 | if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) | |
1051 | return -EEXIST; | |
1052 | ||
0eba801b PNA |
1053 | /* No NAT information has been passed, allocate the null-binding */ |
1054 | if (attr == NULL) | |
7025bac4 | 1055 | return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; |
0eba801b | 1056 | |
096d0906 | 1057 | err = nfnetlink_parse_nat(attr, ct, &range); |
c7232c99 PM |
1058 | if (err < 0) |
1059 | return err; | |
e6a7d3c0 | 1060 | |
ecfcdfec | 1061 | return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; |
e6a7d3c0 PNA |
1062 | } |
1063 | #else | |
1064 | static int | |
1065 | nfnetlink_parse_nat_setup(struct nf_conn *ct, | |
1066 | enum nf_nat_manip_type manip, | |
39938324 | 1067 | const struct nlattr *attr) |
e6a7d3c0 PNA |
1068 | { |
1069 | return -EOPNOTSUPP; | |
1070 | } | |
1071 | #endif | |
1072 | ||

static struct nf_ct_helper_expectfn follow_master_nat = {
	.name		= "nat-follow-master",
	.expectfn	= nf_nat_follow_master,
};

int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
		       const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
{
	struct nat_net *nat_net = net_generic(net, nat_net_id);
	struct nf_nat_hooks_net *nat_proto_net;
	struct nf_nat_lookup_hook_priv *priv;
	unsigned int hooknum = ops->hooknum;
	struct nf_hook_ops *nat_ops;
	int i, ret;

	if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
		return -EINVAL;

	nat_proto_net = &nat_net->nat_proto_net[pf];

	for (i = 0; i < ops_count; i++) {
		if (orig_nat_ops[i].hooknum == hooknum) {
			hooknum = i;
			break;
		}
	}

	if (WARN_ON_ONCE(i == ops_count))
		return -EINVAL;

	mutex_lock(&nf_nat_proto_mutex);
	if (!nat_proto_net->nat_hook_ops) {
		WARN_ON(nat_proto_net->users != 0);

		nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
		if (!nat_ops) {
			mutex_unlock(&nf_nat_proto_mutex);
			return -ENOMEM;
		}

		for (i = 0; i < ops_count; i++) {
			priv = kzalloc(sizeof(*priv), GFP_KERNEL);
			if (priv) {
				nat_ops[i].priv = priv;
				continue;
			}
			mutex_unlock(&nf_nat_proto_mutex);
			while (i)
				kfree(nat_ops[--i].priv);
			kfree(nat_ops);
			return -ENOMEM;
		}

		ret = nf_register_net_hooks(net, nat_ops, ops_count);
		if (ret < 0) {
			mutex_unlock(&nf_nat_proto_mutex);
			for (i = 0; i < ops_count; i++)
				kfree(nat_ops[i].priv);
			kfree(nat_ops);
			return ret;
		}

		nat_proto_net->nat_hook_ops = nat_ops;
	}

	nat_ops = nat_proto_net->nat_hook_ops;
	priv = nat_ops[hooknum].priv;
	if (WARN_ON_ONCE(!priv)) {
		mutex_unlock(&nf_nat_proto_mutex);
		return -EOPNOTSUPP;
	}

	ret = nf_hook_entries_insert_raw(&priv->entries, ops);
	if (ret == 0)
		nat_proto_net->users++;

	mutex_unlock(&nf_nat_proto_mutex);
	return ret;
}

void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
			  unsigned int ops_count)
{
	struct nat_net *nat_net = net_generic(net, nat_net_id);
	struct nf_nat_hooks_net *nat_proto_net;
	struct nf_nat_lookup_hook_priv *priv;
	struct nf_hook_ops *nat_ops;
	int hooknum = ops->hooknum;
	int i;

	if (pf >= ARRAY_SIZE(nat_net->nat_proto_net))
		return;

	nat_proto_net = &nat_net->nat_proto_net[pf];

	mutex_lock(&nf_nat_proto_mutex);
	if (WARN_ON(nat_proto_net->users == 0))
		goto unlock;

	nat_proto_net->users--;

	nat_ops = nat_proto_net->nat_hook_ops;
	for (i = 0; i < ops_count; i++) {
		if (nat_ops[i].hooknum == hooknum) {
			hooknum = i;
			break;
		}
	}
	if (WARN_ON_ONCE(i == ops_count))
		goto unlock;
	priv = nat_ops[hooknum].priv;
	nf_hook_entries_delete_raw(&priv->entries, ops);

	if (nat_proto_net->users == 0) {
		nf_unregister_net_hooks(net, nat_ops, ops_count);

		for (i = 0; i < ops_count; i++) {
			priv = nat_ops[i].priv;
			kfree_rcu(priv, rcu_head);
		}

		nat_proto_net->nat_hook_ops = NULL;
		kfree(nat_ops);
	}
unlock:
	mutex_unlock(&nf_nat_proto_mutex);
}

static struct pernet_operations nat_net_ops = {
	.id = &nat_net_id,
	.size = sizeof(struct nat_net),
};

static const struct nf_nat_hook nat_hook = {
	.parse_nat_setup	= nfnetlink_parse_nat_setup,
#ifdef CONFIG_XFRM
	.decode_session		= __nf_nat_decode_session,
#endif
	.manip_pkt		= nf_nat_manip_pkt,
	.remove_nat_bysrc	= nf_nat_cleanup_conntrack,
};

static int __init nf_nat_init(void)
{
	int ret, i;

	/* Leave them the same for the moment. */
	nf_nat_htable_size = nf_conntrack_htable_size;
	if (nf_nat_htable_size < CONNTRACK_LOCKS)
		nf_nat_htable_size = CONNTRACK_LOCKS;

	nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
	if (!nf_nat_bysource)
		return -ENOMEM;

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_nat_locks[i]);

	ret = register_pernet_subsys(&nat_net_ops);
	if (ret < 0) {
		kvfree(nf_nat_bysource);
		return ret;
	}

	nf_ct_helper_expectfn_register(&follow_master_nat);

	WARN_ON(nf_nat_hook != NULL);
	RCU_INIT_POINTER(nf_nat_hook, &nat_hook);

	ret = register_nf_nat_bpf();
	if (ret < 0) {
		RCU_INIT_POINTER(nf_nat_hook, NULL);
		nf_ct_helper_expectfn_unregister(&follow_master_nat);
		synchronize_net();
		unregister_pernet_subsys(&nat_net_ops);
		kvfree(nf_nat_bysource);
	}

	return ret;
}

static void __exit nf_nat_cleanup(void)
{
	struct nf_nat_proto_clean clean = {};

	nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);

	nf_ct_helper_expectfn_unregister(&follow_master_nat);
	RCU_INIT_POINTER(nf_nat_hook, NULL);

	synchronize_net();
	kvfree(nf_nat_bysource);
	unregister_pernet_subsys(&nat_net_ops);
}

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Network address translation core");

module_init(nf_nat_init);
module_exit(nf_nat_cleanup);