Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next
[linux-block.git] / net / ipv6 / route.c
CommitLineData
1da177e4
LT
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */
13
/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */
26
f3213831
JP
27#define pr_fmt(fmt) "IPv6: " fmt
28
4fc268d2 29#include <linux/capability.h>
1da177e4 30#include <linux/errno.h>
bc3b2d7f 31#include <linux/export.h>
1da177e4
LT
32#include <linux/types.h>
33#include <linux/times.h>
34#include <linux/socket.h>
35#include <linux/sockios.h>
36#include <linux/net.h>
37#include <linux/route.h>
38#include <linux/netdevice.h>
39#include <linux/in6.h>
7bc570c8 40#include <linux/mroute6.h>
1da177e4 41#include <linux/init.h>
1da177e4 42#include <linux/if_arp.h>
1da177e4
LT
43#include <linux/proc_fs.h>
44#include <linux/seq_file.h>
5b7c931d 45#include <linux/nsproxy.h>
5a0e3ad6 46#include <linux/slab.h>
35732d01 47#include <linux/jhash.h>
457c4cbc 48#include <net/net_namespace.h>
1da177e4
LT
49#include <net/snmp.h>
50#include <net/ipv6.h>
51#include <net/ip6_fib.h>
52#include <net/ip6_route.h>
53#include <net/ndisc.h>
54#include <net/addrconf.h>
55#include <net/tcp.h>
56#include <linux/rtnetlink.h>
57#include <net/dst.h>
904af04d 58#include <net/dst_metadata.h>
1da177e4 59#include <net/xfrm.h>
8d71740c 60#include <net/netevent.h>
21713ebc 61#include <net/netlink.h>
51ebd318 62#include <net/nexthop.h>
19e42e45 63#include <net/lwtunnel.h>
904af04d 64#include <net/ip_tunnels.h>
ca254490 65#include <net/l3mdev.h>
b811580d 66#include <trace/events/fib6.h>
1da177e4 67
7c0f6ba6 68#include <linux/uaccess.h>
1da177e4
LT
69
70#ifdef CONFIG_SYSCTL
71#include <linux/sysctl.h>
72#endif
73
/*
 * Result of the neighbour-reachability check performed during route scoring.
 * Negative values are failures; rt6_score_route() returns them unchanged and
 * find_match() distinguishes between them.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route must not be used */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour is in NUD_FAILED state */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry; ask for round-robin */
	RT6_NUD_SUCCEED = 1
};
80
83a09abd 81static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
1da177e4 82static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 83static unsigned int ip6_default_advmss(const struct dst_entry *dst);
ebb762f2 84static unsigned int ip6_mtu(const struct dst_entry *dst);
1da177e4
LT
85static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86static void ip6_dst_destroy(struct dst_entry *);
87static void ip6_dst_ifdown(struct dst_entry *,
88 struct net_device *dev, int how);
569d3645 89static int ip6_dst_gc(struct dst_ops *ops);
1da177e4
LT
90
91static int ip6_pkt_discard(struct sk_buff *skb);
ede2059d 92static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
7150aede 93static int ip6_pkt_prohibit(struct sk_buff *skb);
ede2059d 94static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
1da177e4 95static void ip6_link_failure(struct sk_buff *skb);
6700c270
DM
96static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 struct sk_buff *skb, u32 mtu);
98static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 struct sk_buff *skb);
4b32b5ad 100static void rt6_dst_from_metrics_check(struct rt6_info *rt);
52bd4c0c 101static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
16a16cd3
DA
102static size_t rt6_nlmsg_size(struct rt6_info *rt);
103static int rt6_fill_node(struct net *net,
104 struct sk_buff *skb, struct rt6_info *rt,
105 struct in6_addr *dst, struct in6_addr *src,
106 int iif, int type, u32 portid, u32 seq,
107 unsigned int flags);
35732d01
WW
108static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 struct in6_addr *daddr,
110 struct in6_addr *saddr);
1da177e4 111
70ceb4f5 112#ifdef CONFIG_IPV6_ROUTE_INFO
efa2cea0 113static struct rt6_info *rt6_add_route_info(struct net *net,
b71d1d42 114 const struct in6_addr *prefix, int prefixlen,
830218c1
DA
115 const struct in6_addr *gwaddr,
116 struct net_device *dev,
95c96174 117 unsigned int pref);
efa2cea0 118static struct rt6_info *rt6_get_route_info(struct net *net,
b71d1d42 119 const struct in6_addr *prefix, int prefixlen,
830218c1
DA
120 const struct in6_addr *gwaddr,
121 struct net_device *dev);
70ceb4f5
YH
122#endif
123
8d0b94af
MKL
124struct uncached_list {
125 spinlock_t lock;
126 struct list_head head;
127};
128
129static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
510c321b 131void rt6_uncached_list_add(struct rt6_info *rt)
8d0b94af
MKL
132{
133 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
8d0b94af
MKL
135 rt->rt6i_uncached_list = ul;
136
137 spin_lock_bh(&ul->lock);
138 list_add_tail(&rt->rt6i_uncached, &ul->head);
139 spin_unlock_bh(&ul->lock);
140}
141
510c321b 142void rt6_uncached_list_del(struct rt6_info *rt)
8d0b94af
MKL
143{
144 if (!list_empty(&rt->rt6i_uncached)) {
145 struct uncached_list *ul = rt->rt6i_uncached_list;
81eb8447 146 struct net *net = dev_net(rt->dst.dev);
8d0b94af
MKL
147
148 spin_lock_bh(&ul->lock);
149 list_del(&rt->rt6i_uncached);
81eb8447 150 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
8d0b94af
MKL
151 spin_unlock_bh(&ul->lock);
152 }
153}
154
155static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156{
157 struct net_device *loopback_dev = net->loopback_dev;
158 int cpu;
159
e332bc67
EB
160 if (dev == loopback_dev)
161 return;
162
8d0b94af
MKL
163 for_each_possible_cpu(cpu) {
164 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 struct rt6_info *rt;
166
167 spin_lock_bh(&ul->lock);
168 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 struct inet6_dev *rt_idev = rt->rt6i_idev;
170 struct net_device *rt_dev = rt->dst.dev;
171
e332bc67 172 if (rt_idev->dev == dev) {
8d0b94af
MKL
173 rt->rt6i_idev = in6_dev_get(loopback_dev);
174 in6_dev_put(rt_idev);
175 }
176
e332bc67 177 if (rt_dev == dev) {
8d0b94af
MKL
178 rt->dst.dev = loopback_dev;
179 dev_hold(rt->dst.dev);
180 dev_put(rt_dev);
181 }
182 }
183 spin_unlock_bh(&ul->lock);
184 }
185}
186
d52d3997
MKL
187static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188{
3a2232e9 189 return dst_metrics_write_ptr(&rt->from->dst);
d52d3997
MKL
190}
191
06582540
DM
192static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193{
4b32b5ad 194 struct rt6_info *rt = (struct rt6_info *)dst;
06582540 195
d52d3997
MKL
196 if (rt->rt6i_flags & RTF_PCPU)
197 return rt6_pcpu_cow_metrics(rt);
198 else if (rt->rt6i_flags & RTF_CACHE)
4b32b5ad
MKL
199 return NULL;
200 else
3b471175 201 return dst_cow_metrics_generic(dst, old);
06582540
DM
202}
203
f894cbf8
DM
204static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205 struct sk_buff *skb,
206 const void *daddr)
39232973
DM
207{
208 struct in6_addr *p = &rt->rt6i_gateway;
209
a7563f34 210 if (!ipv6_addr_any(p))
39232973 211 return (const void *) p;
f894cbf8
DM
212 else if (skb)
213 return &ipv6_hdr(skb)->daddr;
39232973
DM
214 return daddr;
215}
216
f894cbf8
DM
217static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218 struct sk_buff *skb,
219 const void *daddr)
d3aaeb38 220{
39232973
DM
221 struct rt6_info *rt = (struct rt6_info *) dst;
222 struct neighbour *n;
223
f894cbf8 224 daddr = choose_neigh_daddr(rt, skb, daddr);
8e022ee6 225 n = __ipv6_neigh_lookup(dst->dev, daddr);
f83c7790
DM
226 if (n)
227 return n;
228 return neigh_create(&nd_tbl, daddr, dst->dev);
229}
230
63fca65d
JA
231static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232{
233 struct net_device *dev = dst->dev;
234 struct rt6_info *rt = (struct rt6_info *)dst;
235
236 daddr = choose_neigh_daddr(rt, NULL, daddr);
237 if (!daddr)
238 return;
239 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240 return;
241 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242 return;
243 __ipv6_confirm_neigh(dev, daddr);
244}
245
9a7ec3a9 246static struct dst_ops ip6_dst_ops_template = {
1da177e4 247 .family = AF_INET6,
1da177e4
LT
248 .gc = ip6_dst_gc,
249 .gc_thresh = 1024,
250 .check = ip6_dst_check,
0dbaee3b 251 .default_advmss = ip6_default_advmss,
ebb762f2 252 .mtu = ip6_mtu,
06582540 253 .cow_metrics = ipv6_cow_metrics,
1da177e4
LT
254 .destroy = ip6_dst_destroy,
255 .ifdown = ip6_dst_ifdown,
256 .negative_advice = ip6_negative_advice,
257 .link_failure = ip6_link_failure,
258 .update_pmtu = ip6_rt_update_pmtu,
6e157b6a 259 .redirect = rt6_do_redirect,
9f8955cc 260 .local_out = __ip6_local_out,
d3aaeb38 261 .neigh_lookup = ip6_neigh_lookup,
63fca65d 262 .confirm_neigh = ip6_confirm_neigh,
1da177e4
LT
263};
264
ebb762f2 265static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 266{
618f9bc7
SK
267 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269 return mtu ? : dst->dev->mtu;
ec831ea7
RD
270}
271
6700c270
DM
272static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273 struct sk_buff *skb, u32 mtu)
14e50e57
DM
274{
275}
276
6700c270
DM
/* dst_ops->redirect handler for blackhole dsts: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
281
14e50e57
DM
282static struct dst_ops ip6_dst_blackhole_ops = {
283 .family = AF_INET6,
14e50e57
DM
284 .destroy = ip6_dst_destroy,
285 .check = ip6_dst_check,
ebb762f2 286 .mtu = ip6_blackhole_mtu,
214f45c9 287 .default_advmss = ip6_default_advmss,
14e50e57 288 .update_pmtu = ip6_rt_blackhole_update_pmtu,
b587ee3b 289 .redirect = ip6_rt_blackhole_redirect,
0a1f5962 290 .cow_metrics = dst_cow_metrics_generic,
d3aaeb38 291 .neigh_lookup = ip6_neigh_lookup,
14e50e57
DM
292};
293
62fa8a84 294static const u32 ip6_template_metrics[RTAX_MAX] = {
14edd87d 295 [RTAX_HOPLIMIT - 1] = 0,
62fa8a84
DM
296};
297
fb0af4c7 298static const struct rt6_info ip6_null_entry_template = {
d8d1f30b
CG
299 .dst = {
300 .__refcnt = ATOMIC_INIT(1),
301 .__use = 1,
2c20cbd7 302 .obsolete = DST_OBSOLETE_FORCE_CHK,
d8d1f30b 303 .error = -ENETUNREACH,
d8d1f30b
CG
304 .input = ip6_pkt_discard,
305 .output = ip6_pkt_discard_out,
1da177e4
LT
306 },
307 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
4f724279 308 .rt6i_protocol = RTPROT_KERNEL,
1da177e4
LT
309 .rt6i_metric = ~(u32) 0,
310 .rt6i_ref = ATOMIC_INIT(1),
311};
312
101367c2
TG
313#ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
fb0af4c7 315static const struct rt6_info ip6_prohibit_entry_template = {
d8d1f30b
CG
316 .dst = {
317 .__refcnt = ATOMIC_INIT(1),
318 .__use = 1,
2c20cbd7 319 .obsolete = DST_OBSOLETE_FORCE_CHK,
d8d1f30b 320 .error = -EACCES,
d8d1f30b
CG
321 .input = ip6_pkt_prohibit,
322 .output = ip6_pkt_prohibit_out,
101367c2
TG
323 },
324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
4f724279 325 .rt6i_protocol = RTPROT_KERNEL,
101367c2
TG
326 .rt6i_metric = ~(u32) 0,
327 .rt6i_ref = ATOMIC_INIT(1),
328};
329
fb0af4c7 330static const struct rt6_info ip6_blk_hole_entry_template = {
d8d1f30b
CG
331 .dst = {
332 .__refcnt = ATOMIC_INIT(1),
333 .__use = 1,
2c20cbd7 334 .obsolete = DST_OBSOLETE_FORCE_CHK,
d8d1f30b 335 .error = -EINVAL,
d8d1f30b 336 .input = dst_discard,
ede2059d 337 .output = dst_discard_out,
101367c2
TG
338 },
339 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
4f724279 340 .rt6i_protocol = RTPROT_KERNEL,
101367c2
TG
341 .rt6i_metric = ~(u32) 0,
342 .rt6i_ref = ATOMIC_INIT(1),
343};
344
345#endif
346
ebfa45f0
MKL
347static void rt6_info_init(struct rt6_info *rt)
348{
349 struct dst_entry *dst = &rt->dst;
350
351 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 INIT_LIST_HEAD(&rt->rt6i_siblings);
353 INIT_LIST_HEAD(&rt->rt6i_uncached);
354}
355
1da177e4 356/* allocate dst with ip6_dst_ops */
d52d3997
MKL
357static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 struct net_device *dev,
ad706862 359 int flags)
1da177e4 360{
97bab73f 361 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
b2a9c0ed 362 1, DST_OBSOLETE_FORCE_CHK, flags);
cf911662 363
81eb8447 364 if (rt) {
ebfa45f0 365 rt6_info_init(rt);
81eb8447
WW
366 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367 }
8104891b 368
cf911662 369 return rt;
1da177e4
LT
370}
371
9ab179d8
DA
372struct rt6_info *ip6_dst_alloc(struct net *net,
373 struct net_device *dev,
374 int flags)
d52d3997 375{
ad706862 376 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
d52d3997
MKL
377
378 if (rt) {
379 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
bfd8e5a4 380 if (!rt->rt6i_pcpu) {
587fea74 381 dst_release_immediate(&rt->dst);
d52d3997
MKL
382 return NULL;
383 }
384 }
385
386 return rt;
387}
9ab179d8 388EXPORT_SYMBOL(ip6_dst_alloc);
d52d3997 389
1da177e4
LT
390static void ip6_dst_destroy(struct dst_entry *dst)
391{
392 struct rt6_info *rt = (struct rt6_info *)dst;
35732d01 393 struct rt6_exception_bucket *bucket;
3a2232e9 394 struct rt6_info *from = rt->from;
8d0b94af 395 struct inet6_dev *idev;
1da177e4 396
4b32b5ad 397 dst_destroy_metrics_generic(dst);
87775312 398 free_percpu(rt->rt6i_pcpu);
8d0b94af
MKL
399 rt6_uncached_list_del(rt);
400
401 idev = rt->rt6i_idev;
38308473 402 if (idev) {
1da177e4
LT
403 rt->rt6i_idev = NULL;
404 in6_dev_put(idev);
1ab1457c 405 }
35732d01
WW
406 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407 if (bucket) {
408 rt->rt6i_exception_bucket = NULL;
409 kfree(bucket);
410 }
1716a961 411
3a2232e9
DM
412 rt->from = NULL;
413 dst_release(&from->dst);
b3419363
DM
414}
415
1da177e4
LT
416static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417 int how)
418{
419 struct rt6_info *rt = (struct rt6_info *)dst;
420 struct inet6_dev *idev = rt->rt6i_idev;
5a3e55d6 421 struct net_device *loopback_dev =
c346dca1 422 dev_net(dev)->loopback_dev;
1da177e4 423
e5645f51
WW
424 if (idev && idev->dev != loopback_dev) {
425 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426 if (loopback_idev) {
427 rt->rt6i_idev = loopback_idev;
428 in6_dev_put(idev);
97cac082 429 }
1da177e4
LT
430 }
431}
432
5973fb1e
MKL
433static bool __rt6_check_expired(const struct rt6_info *rt)
434{
435 if (rt->rt6i_flags & RTF_EXPIRES)
436 return time_after(jiffies, rt->dst.expires);
437 else
438 return false;
439}
440
a50feda5 441static bool rt6_check_expired(const struct rt6_info *rt)
1da177e4 442{
1716a961
G
443 if (rt->rt6i_flags & RTF_EXPIRES) {
444 if (time_after(jiffies, rt->dst.expires))
a50feda5 445 return true;
3a2232e9 446 } else if (rt->from) {
1e2ea8ad 447 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
3a2232e9 448 rt6_check_expired(rt->from);
1716a961 449 }
a50feda5 450 return false;
1da177e4
LT
451}
452
b4bac172
DA
453static struct rt6_info *rt6_multipath_select(const struct net *net,
454 struct rt6_info *match,
52bd4c0c 455 struct flowi6 *fl6, int oif,
b75cc8f9 456 const struct sk_buff *skb,
52bd4c0c 457 int strict)
51ebd318
ND
458{
459 struct rt6_info *sibling, *next_sibling;
51ebd318 460
b673d6cc
JS
461 /* We might have already computed the hash for ICMPv6 errors. In such
462 * case it will always be non-zero. Otherwise now is the time to do it.
463 */
464 if (!fl6->mp_hash)
b4bac172 465 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
b673d6cc 466
3d709f69
IS
467 if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
468 return match;
469
470 list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
471 rt6i_siblings) {
472 if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
473 continue;
474 if (rt6_score_route(sibling, oif, strict) < 0)
475 break;
476 match = sibling;
477 break;
478 }
479
51ebd318
ND
480 return match;
481}
482
/*
 *	Route lookup. rcu_read_lock() should be held.
 */
486
8ed67789
DL
487static inline struct rt6_info *rt6_device_match(struct net *net,
488 struct rt6_info *rt,
b71d1d42 489 const struct in6_addr *saddr,
1da177e4 490 int oif,
d420895e 491 int flags)
1da177e4
LT
492{
493 struct rt6_info *local = NULL;
494 struct rt6_info *sprt;
495
8067bb8c
IS
496 if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
497 return rt;
dd3abc4e 498
071fb37e 499 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
d1918542 500 struct net_device *dev = sprt->dst.dev;
dd3abc4e 501
8067bb8c
IS
502 if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
503 continue;
504
dd3abc4e 505 if (oif) {
1da177e4
LT
506 if (dev->ifindex == oif)
507 return sprt;
508 if (dev->flags & IFF_LOOPBACK) {
38308473 509 if (!sprt->rt6i_idev ||
1da177e4 510 sprt->rt6i_idev->dev->ifindex != oif) {
17fb0b2b 511 if (flags & RT6_LOOKUP_F_IFACE)
1da177e4 512 continue;
17fb0b2b
DA
513 if (local &&
514 local->rt6i_idev->dev->ifindex == oif)
1da177e4
LT
515 continue;
516 }
517 local = sprt;
518 }
dd3abc4e
YH
519 } else {
520 if (ipv6_chk_addr(net, saddr, dev,
521 flags & RT6_LOOKUP_F_IFACE))
522 return sprt;
1da177e4 523 }
dd3abc4e 524 }
1da177e4 525
dd3abc4e 526 if (oif) {
1da177e4
LT
527 if (local)
528 return local;
529
d420895e 530 if (flags & RT6_LOOKUP_F_IFACE)
8ed67789 531 return net->ipv6.ip6_null_entry;
1da177e4 532 }
8067bb8c
IS
533
534 return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
1da177e4
LT
535}
536
27097255 537#ifdef CONFIG_IPV6_ROUTER_PREF
c2f17e82
HFS
538struct __rt6_probe_work {
539 struct work_struct work;
540 struct in6_addr target;
541 struct net_device *dev;
542};
543
544static void rt6_probe_deferred(struct work_struct *w)
545{
546 struct in6_addr mcaddr;
547 struct __rt6_probe_work *work =
548 container_of(w, struct __rt6_probe_work, work);
549
550 addrconf_addr_solict_mult(&work->target, &mcaddr);
adc176c5 551 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
c2f17e82 552 dev_put(work->dev);
662f5533 553 kfree(work);
c2f17e82
HFS
554}
555
27097255
YH
556static void rt6_probe(struct rt6_info *rt)
557{
990edb42 558 struct __rt6_probe_work *work;
f2c31e32 559 struct neighbour *neigh;
27097255
YH
560 /*
561 * Okay, this does not seem to be appropriate
562 * for now, however, we need to check if it
563 * is really so; aka Router Reachability Probing.
564 *
565 * Router Reachability Probe MUST be rate-limited
566 * to no more than one per minute.
567 */
2152caea 568 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
7ff74a59 569 return;
2152caea
YH
570 rcu_read_lock_bh();
571 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
572 if (neigh) {
8d6c31bf
MKL
573 if (neigh->nud_state & NUD_VALID)
574 goto out;
575
990edb42 576 work = NULL;
2152caea 577 write_lock(&neigh->lock);
990edb42
MKL
578 if (!(neigh->nud_state & NUD_VALID) &&
579 time_after(jiffies,
580 neigh->updated +
581 rt->rt6i_idev->cnf.rtr_probe_interval)) {
582 work = kmalloc(sizeof(*work), GFP_ATOMIC);
583 if (work)
584 __neigh_set_probe_once(neigh);
c2f17e82 585 }
2152caea 586 write_unlock(&neigh->lock);
990edb42
MKL
587 } else {
588 work = kmalloc(sizeof(*work), GFP_ATOMIC);
f2c31e32 589 }
990edb42
MKL
590
591 if (work) {
592 INIT_WORK(&work->work, rt6_probe_deferred);
593 work->target = rt->rt6i_gateway;
594 dev_hold(rt->dst.dev);
595 work->dev = rt->dst.dev;
596 schedule_work(&work->work);
597 }
598
8d6c31bf 599out:
2152caea 600 rcu_read_unlock_bh();
27097255
YH
601}
602#else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
606#endif
607
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
b6f99a21 611static inline int rt6_check_dev(struct rt6_info *rt, int oif)
554cfb7e 612{
d1918542 613 struct net_device *dev = rt->dst.dev;
161980f4 614 if (!oif || dev->ifindex == oif)
554cfb7e 615 return 2;
161980f4
DM
616 if ((dev->flags & IFF_LOOPBACK) &&
617 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
618 return 1;
619 return 0;
554cfb7e 620}
1da177e4 621
afc154e9 622static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
1da177e4 623{
f2c31e32 624 struct neighbour *neigh;
afc154e9 625 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
f2c31e32 626
4d0c5911
YH
627 if (rt->rt6i_flags & RTF_NONEXTHOP ||
628 !(rt->rt6i_flags & RTF_GATEWAY))
afc154e9 629 return RT6_NUD_SUCCEED;
145a3621
YH
630
631 rcu_read_lock_bh();
632 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
633 if (neigh) {
634 read_lock(&neigh->lock);
554cfb7e 635 if (neigh->nud_state & NUD_VALID)
afc154e9 636 ret = RT6_NUD_SUCCEED;
398bcbeb 637#ifdef CONFIG_IPV6_ROUTER_PREF
a5a81f0b 638 else if (!(neigh->nud_state & NUD_FAILED))
afc154e9 639 ret = RT6_NUD_SUCCEED;
7e980569
JB
640 else
641 ret = RT6_NUD_FAIL_PROBE;
398bcbeb 642#endif
145a3621 643 read_unlock(&neigh->lock);
afc154e9
HFS
644 } else {
645 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
7e980569 646 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
a5a81f0b 647 }
145a3621
YH
648 rcu_read_unlock_bh();
649
a5a81f0b 650 return ret;
1da177e4
LT
651}
652
554cfb7e
YH
653static int rt6_score_route(struct rt6_info *rt, int oif,
654 int strict)
1da177e4 655{
a5a81f0b 656 int m;
1ab1457c 657
4d0c5911 658 m = rt6_check_dev(rt, oif);
77d16f45 659 if (!m && (strict & RT6_LOOKUP_F_IFACE))
afc154e9 660 return RT6_NUD_FAIL_HARD;
ebacaaa0
YH
661#ifdef CONFIG_IPV6_ROUTER_PREF
662 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
663#endif
afc154e9
HFS
664 if (strict & RT6_LOOKUP_F_REACHABLE) {
665 int n = rt6_check_neigh(rt);
666 if (n < 0)
667 return n;
668 }
554cfb7e
YH
669 return m;
670}
671
f11e6659 672static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
afc154e9
HFS
673 int *mpri, struct rt6_info *match,
674 bool *do_rr)
554cfb7e 675{
f11e6659 676 int m;
afc154e9 677 bool match_do_rr = false;
35103d11 678 struct inet6_dev *idev = rt->rt6i_idev;
35103d11 679
8067bb8c
IS
680 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
681 goto out;
682
14c5206c
IS
683 if (idev->cnf.ignore_routes_with_linkdown &&
684 rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
d5d32e4b 685 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
35103d11 686 goto out;
f11e6659
DM
687
688 if (rt6_check_expired(rt))
689 goto out;
690
691 m = rt6_score_route(rt, oif, strict);
7e980569 692 if (m == RT6_NUD_FAIL_DO_RR) {
afc154e9
HFS
693 match_do_rr = true;
694 m = 0; /* lowest valid score */
7e980569 695 } else if (m == RT6_NUD_FAIL_HARD) {
f11e6659 696 goto out;
afc154e9
HFS
697 }
698
699 if (strict & RT6_LOOKUP_F_REACHABLE)
700 rt6_probe(rt);
f11e6659 701
7e980569 702 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
f11e6659 703 if (m > *mpri) {
afc154e9 704 *do_rr = match_do_rr;
f11e6659
DM
705 *mpri = m;
706 match = rt;
f11e6659 707 }
f11e6659
DM
708out:
709 return match;
710}
711
712static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
8d1040e8 713 struct rt6_info *leaf,
f11e6659 714 struct rt6_info *rr_head,
afc154e9
HFS
715 u32 metric, int oif, int strict,
716 bool *do_rr)
f11e6659 717{
9fbdcfaf 718 struct rt6_info *rt, *match, *cont;
554cfb7e 719 int mpri = -1;
1da177e4 720
f11e6659 721 match = NULL;
9fbdcfaf 722 cont = NULL;
071fb37e 723 for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
9fbdcfaf
SK
724 if (rt->rt6i_metric != metric) {
725 cont = rt;
726 break;
727 }
728
729 match = find_match(rt, oif, strict, &mpri, match, do_rr);
730 }
731
66f5d6ce 732 for (rt = leaf; rt && rt != rr_head;
071fb37e 733 rt = rcu_dereference(rt->rt6_next)) {
9fbdcfaf
SK
734 if (rt->rt6i_metric != metric) {
735 cont = rt;
736 break;
737 }
738
afc154e9 739 match = find_match(rt, oif, strict, &mpri, match, do_rr);
9fbdcfaf
SK
740 }
741
742 if (match || !cont)
743 return match;
744
071fb37e 745 for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
afc154e9 746 match = find_match(rt, oif, strict, &mpri, match, do_rr);
1da177e4 747
f11e6659
DM
748 return match;
749}
1da177e4 750
8d1040e8
WW
751static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
752 int oif, int strict)
f11e6659 753{
66f5d6ce 754 struct rt6_info *leaf = rcu_dereference(fn->leaf);
f11e6659 755 struct rt6_info *match, *rt0;
afc154e9 756 bool do_rr = false;
17ecf590 757 int key_plen;
1da177e4 758
87b1af8d 759 if (!leaf || leaf == net->ipv6.ip6_null_entry)
8d1040e8
WW
760 return net->ipv6.ip6_null_entry;
761
66f5d6ce 762 rt0 = rcu_dereference(fn->rr_ptr);
f11e6659 763 if (!rt0)
66f5d6ce 764 rt0 = leaf;
1da177e4 765
17ecf590
WW
766 /* Double check to make sure fn is not an intermediate node
767 * and fn->leaf does not points to its child's leaf
768 * (This might happen if all routes under fn are deleted from
769 * the tree and fib6_repair_tree() is called on the node.)
770 */
771 key_plen = rt0->rt6i_dst.plen;
772#ifdef CONFIG_IPV6_SUBTREES
773 if (rt0->rt6i_src.plen)
774 key_plen = rt0->rt6i_src.plen;
775#endif
776 if (fn->fn_bit != key_plen)
777 return net->ipv6.ip6_null_entry;
778
8d1040e8 779 match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
afc154e9 780 &do_rr);
1da177e4 781
afc154e9 782 if (do_rr) {
071fb37e 783 struct rt6_info *next = rcu_dereference(rt0->rt6_next);
f11e6659 784
554cfb7e 785 /* no entries matched; do round-robin */
f11e6659 786 if (!next || next->rt6i_metric != rt0->rt6i_metric)
8d1040e8 787 next = leaf;
f11e6659 788
66f5d6ce
WW
789 if (next != rt0) {
790 spin_lock_bh(&leaf->rt6i_table->tb6_lock);
791 /* make sure next is not being deleted from the tree */
792 if (next->rt6i_node)
793 rcu_assign_pointer(fn->rr_ptr, next);
794 spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
795 }
1da177e4 796 }
1da177e4 797
a02cec21 798 return match ? match : net->ipv6.ip6_null_entry;
1da177e4
LT
799}
800
8b9df265
MKL
801static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
802{
803 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
804}
805
70ceb4f5
YH
806#ifdef CONFIG_IPV6_ROUTE_INFO
807int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
b71d1d42 808 const struct in6_addr *gwaddr)
70ceb4f5 809{
c346dca1 810 struct net *net = dev_net(dev);
70ceb4f5
YH
811 struct route_info *rinfo = (struct route_info *) opt;
812 struct in6_addr prefix_buf, *prefix;
813 unsigned int pref;
4bed72e4 814 unsigned long lifetime;
70ceb4f5
YH
815 struct rt6_info *rt;
816
817 if (len < sizeof(struct route_info)) {
818 return -EINVAL;
819 }
820
821 /* Sanity check for prefix_len and length */
822 if (rinfo->length > 3) {
823 return -EINVAL;
824 } else if (rinfo->prefix_len > 128) {
825 return -EINVAL;
826 } else if (rinfo->prefix_len > 64) {
827 if (rinfo->length < 2) {
828 return -EINVAL;
829 }
830 } else if (rinfo->prefix_len > 0) {
831 if (rinfo->length < 1) {
832 return -EINVAL;
833 }
834 }
835
836 pref = rinfo->route_pref;
837 if (pref == ICMPV6_ROUTER_PREF_INVALID)
3933fc95 838 return -EINVAL;
70ceb4f5 839
4bed72e4 840 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
70ceb4f5
YH
841
842 if (rinfo->length == 3)
843 prefix = (struct in6_addr *)rinfo->prefix;
844 else {
845 /* this function is safe */
846 ipv6_addr_prefix(&prefix_buf,
847 (struct in6_addr *)rinfo->prefix,
848 rinfo->prefix_len);
849 prefix = &prefix_buf;
850 }
851
f104a567
DJ
852 if (rinfo->prefix_len == 0)
853 rt = rt6_get_dflt_router(gwaddr, dev);
854 else
855 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
830218c1 856 gwaddr, dev);
70ceb4f5
YH
857
858 if (rt && !lifetime) {
e0a1ad73 859 ip6_del_rt(rt);
70ceb4f5
YH
860 rt = NULL;
861 }
862
863 if (!rt && lifetime)
830218c1
DA
864 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
865 dev, pref);
70ceb4f5
YH
866 else if (rt)
867 rt->rt6i_flags = RTF_ROUTEINFO |
868 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
869
870 if (rt) {
1716a961
G
871 if (!addrconf_finite_timeout(lifetime))
872 rt6_clean_expires(rt);
873 else
874 rt6_set_expires(rt, jiffies + HZ * lifetime);
875
94e187c0 876 ip6_rt_put(rt);
70ceb4f5
YH
877 }
878 return 0;
879}
880#endif
881
a3c00e46
MKL
882static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
883 struct in6_addr *saddr)
884{
66f5d6ce 885 struct fib6_node *pn, *sn;
a3c00e46
MKL
886 while (1) {
887 if (fn->fn_flags & RTN_TL_ROOT)
888 return NULL;
66f5d6ce
WW
889 pn = rcu_dereference(fn->parent);
890 sn = FIB6_SUBTREE(pn);
891 if (sn && sn != fn)
892 fn = fib6_lookup(sn, NULL, saddr);
a3c00e46
MKL
893 else
894 fn = pn;
895 if (fn->fn_flags & RTN_RTINFO)
896 return fn;
897 }
898}
c71099ac 899
d3843fe5
WW
900static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
901 bool null_fallback)
902{
903 struct rt6_info *rt = *prt;
904
905 if (dst_hold_safe(&rt->dst))
906 return true;
907 if (null_fallback) {
908 rt = net->ipv6.ip6_null_entry;
909 dst_hold(&rt->dst);
910 } else {
911 rt = NULL;
912 }
913 *prt = rt;
914 return false;
915}
916
8ed67789
DL
917static struct rt6_info *ip6_pol_route_lookup(struct net *net,
918 struct fib6_table *table,
b75cc8f9
DA
919 struct flowi6 *fl6,
920 const struct sk_buff *skb,
921 int flags)
1da177e4 922{
2b760fcf 923 struct rt6_info *rt, *rt_cache;
1da177e4 924 struct fib6_node *fn;
1da177e4 925
66f5d6ce 926 rcu_read_lock();
4c9483b2 927 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
c71099ac 928restart:
66f5d6ce
WW
929 rt = rcu_dereference(fn->leaf);
930 if (!rt) {
931 rt = net->ipv6.ip6_null_entry;
932 } else {
933 rt = rt6_device_match(net, rt, &fl6->saddr,
934 fl6->flowi6_oif, flags);
935 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
b4bac172 936 rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
b75cc8f9 937 skb, flags);
66f5d6ce 938 }
a3c00e46
MKL
939 if (rt == net->ipv6.ip6_null_entry) {
940 fn = fib6_backtrack(fn, &fl6->saddr);
941 if (fn)
942 goto restart;
943 }
2b760fcf
WW
944 /* Search through exception table */
945 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
946 if (rt_cache)
947 rt = rt_cache;
948
d3843fe5
WW
949 if (ip6_hold_safe(net, &rt, true))
950 dst_use_noref(&rt->dst, jiffies);
951
66f5d6ce 952 rcu_read_unlock();
b811580d 953
b65f164d 954 trace_fib6_table_lookup(net, rt, table, fl6);
b811580d 955
c71099ac
TG
956 return rt;
957
958}
959
67ba4152 960struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
b75cc8f9 961 const struct sk_buff *skb, int flags)
ea6e574e 962{
b75cc8f9 963 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
ea6e574e
FW
964}
965EXPORT_SYMBOL_GPL(ip6_route_lookup);
966
9acd9f3a 967struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
b75cc8f9
DA
968 const struct in6_addr *saddr, int oif,
969 const struct sk_buff *skb, int strict)
c71099ac 970{
4c9483b2
DM
971 struct flowi6 fl6 = {
972 .flowi6_oif = oif,
973 .daddr = *daddr,
c71099ac
TG
974 };
975 struct dst_entry *dst;
77d16f45 976 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
c71099ac 977
adaa70bb 978 if (saddr) {
4c9483b2 979 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
adaa70bb
TG
980 flags |= RT6_LOOKUP_F_HAS_SADDR;
981 }
982
b75cc8f9 983 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
c71099ac
TG
984 if (dst->error == 0)
985 return (struct rt6_info *) dst;
986
987 dst_release(dst);
988
1da177e4
LT
989 return NULL;
990}
7159039a
YH
991EXPORT_SYMBOL(rt6_lookup);
992
/* ip6_ins_rt is called with table->tb6_lock NOT held.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * The caller must hold a reference on the dst before calling it.
 */
998
e5fd387a 999static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
333c4301
DA
1000 struct mx6_config *mxc,
1001 struct netlink_ext_ack *extack)
1da177e4
LT
1002{
1003 int err;
c71099ac 1004 struct fib6_table *table;
1da177e4 1005
c71099ac 1006 table = rt->rt6i_table;
66f5d6ce 1007 spin_lock_bh(&table->tb6_lock);
333c4301 1008 err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
66f5d6ce 1009 spin_unlock_bh(&table->tb6_lock);
1da177e4
LT
1010
1011 return err;
1012}
1013
40e22e8f
TG
1014int ip6_ins_rt(struct rt6_info *rt)
1015{
e715b6d3
FW
1016 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1017 struct mx6_config mxc = { .mx = NULL, };
1018
1cfb71ee
WW
1019 /* Hold dst to account for the reference from the fib6 tree */
1020 dst_hold(&rt->dst);
333c4301 1021 return __ip6_ins_rt(rt, &info, &mxc, NULL);
40e22e8f
TG
1022}
1023
4832c30d
DA
1024/* called with rcu_lock held */
1025static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1026{
1027 struct net_device *dev = rt->dst.dev;
1028
98d11291 1029 if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
4832c30d
DA
1030 /* for copies of local routes, dst->dev needs to be the
1031 * device if it is a master device, the master device if
1032 * device is enslaved, and the loopback as the default
1033 */
1034 if (netif_is_l3_slave(dev) &&
1035 !rt6_need_strict(&rt->rt6i_dst.addr))
1036 dev = l3mdev_master_dev_rcu(dev);
1037 else if (!netif_is_l3_master(dev))
1038 dev = dev_net(dev)->loopback_dev;
1039 /* last case is netif_is_l3_master(dev) is true in which
1040 * case we want dev returned to be dev
1041 */
1042 }
1043
1044 return dev;
1045}
1046
8b9df265
MKL
1047static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1048 const struct in6_addr *daddr,
1049 const struct in6_addr *saddr)
1da177e4 1050{
4832c30d 1051 struct net_device *dev;
1da177e4
LT
1052 struct rt6_info *rt;
1053
1054 /*
1055 * Clone the route.
1056 */
1057
d52d3997 1058 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
3a2232e9 1059 ort = ort->from;
1da177e4 1060
4832c30d
DA
1061 rcu_read_lock();
1062 dev = ip6_rt_get_dev_rcu(ort);
1063 rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1064 rcu_read_unlock();
83a09abd
MKL
1065 if (!rt)
1066 return NULL;
1067
1068 ip6_rt_copy_init(rt, ort);
1069 rt->rt6i_flags |= RTF_CACHE;
1070 rt->rt6i_metric = 0;
1071 rt->dst.flags |= DST_HOST;
1072 rt->rt6i_dst.addr = *daddr;
1073 rt->rt6i_dst.plen = 128;
1da177e4 1074
83a09abd
MKL
1075 if (!rt6_is_gw_or_nonexthop(ort)) {
1076 if (ort->rt6i_dst.plen != 128 &&
1077 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1078 rt->rt6i_flags |= RTF_ANYCAST;
1da177e4 1079#ifdef CONFIG_IPV6_SUBTREES
83a09abd
MKL
1080 if (rt->rt6i_src.plen && saddr) {
1081 rt->rt6i_src.addr = *saddr;
1082 rt->rt6i_src.plen = 128;
8b9df265 1083 }
83a09abd 1084#endif
95a9a5ba 1085 }
1da177e4 1086
95a9a5ba
YH
1087 return rt;
1088}
1da177e4 1089
d52d3997
MKL
1090static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1091{
4832c30d 1092 struct net_device *dev;
d52d3997
MKL
1093 struct rt6_info *pcpu_rt;
1094
4832c30d
DA
1095 rcu_read_lock();
1096 dev = ip6_rt_get_dev_rcu(rt);
1097 pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1098 rcu_read_unlock();
d52d3997
MKL
1099 if (!pcpu_rt)
1100 return NULL;
1101 ip6_rt_copy_init(pcpu_rt, rt);
1102 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1103 pcpu_rt->rt6i_flags |= RTF_PCPU;
1104 return pcpu_rt;
1105}
1106
66f5d6ce 1107/* It should be called with rcu_read_lock() acquired */
d52d3997
MKL
1108static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1109{
a73e4195 1110 struct rt6_info *pcpu_rt, **p;
d52d3997
MKL
1111
1112 p = this_cpu_ptr(rt->rt6i_pcpu);
1113 pcpu_rt = *p;
1114
d3843fe5 1115 if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
a73e4195 1116 rt6_dst_from_metrics_check(pcpu_rt);
d3843fe5 1117
a73e4195
MKL
1118 return pcpu_rt;
1119}
1120
1121static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1122{
1123 struct rt6_info *pcpu_rt, *prev, **p;
d52d3997
MKL
1124
1125 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1126 if (!pcpu_rt) {
1127 struct net *net = dev_net(rt->dst.dev);
1128
9c7370a1
MKL
1129 dst_hold(&net->ipv6.ip6_null_entry->dst);
1130 return net->ipv6.ip6_null_entry;
d52d3997
MKL
1131 }
1132
a94b9367
WW
1133 dst_hold(&pcpu_rt->dst);
1134 p = this_cpu_ptr(rt->rt6i_pcpu);
1135 prev = cmpxchg(p, NULL, pcpu_rt);
951f788a 1136 BUG_ON(prev);
a94b9367 1137
d52d3997
MKL
1138 rt6_dst_from_metrics_check(pcpu_rt);
1139 return pcpu_rt;
1140}
1141
35732d01
WW
1142/* exception hash table implementation
1143 */
1144static DEFINE_SPINLOCK(rt6_exception_lock);
1145
1146/* Remove rt6_ex from hash table and free the memory
1147 * Caller must hold rt6_exception_lock
1148 */
1149static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1150 struct rt6_exception *rt6_ex)
1151{
b2427e67 1152 struct net *net;
81eb8447 1153
35732d01
WW
1154 if (!bucket || !rt6_ex)
1155 return;
b2427e67
CIK
1156
1157 net = dev_net(rt6_ex->rt6i->dst.dev);
35732d01
WW
1158 rt6_ex->rt6i->rt6i_node = NULL;
1159 hlist_del_rcu(&rt6_ex->hlist);
1160 rt6_release(rt6_ex->rt6i);
1161 kfree_rcu(rt6_ex, rcu);
1162 WARN_ON_ONCE(!bucket->depth);
1163 bucket->depth--;
81eb8447 1164 net->ipv6.rt6_stats->fib_rt_cache--;
35732d01
WW
1165}
1166
1167/* Remove oldest rt6_ex in bucket and free the memory
1168 * Caller must hold rt6_exception_lock
1169 */
1170static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1171{
1172 struct rt6_exception *rt6_ex, *oldest = NULL;
1173
1174 if (!bucket)
1175 return;
1176
1177 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1178 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1179 oldest = rt6_ex;
1180 }
1181 rt6_remove_exception(bucket, oldest);
1182}
1183
1184static u32 rt6_exception_hash(const struct in6_addr *dst,
1185 const struct in6_addr *src)
1186{
1187 static u32 seed __read_mostly;
1188 u32 val;
1189
1190 net_get_random_once(&seed, sizeof(seed));
1191 val = jhash(dst, sizeof(*dst), seed);
1192
1193#ifdef CONFIG_IPV6_SUBTREES
1194 if (src)
1195 val = jhash(src, sizeof(*src), val);
1196#endif
1197 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1198}
1199
1200/* Helper function to find the cached rt in the hash table
1201 * and update bucket pointer to point to the bucket for this
1202 * (daddr, saddr) pair
1203 * Caller must hold rt6_exception_lock
1204 */
1205static struct rt6_exception *
1206__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1207 const struct in6_addr *daddr,
1208 const struct in6_addr *saddr)
1209{
1210 struct rt6_exception *rt6_ex;
1211 u32 hval;
1212
1213 if (!(*bucket) || !daddr)
1214 return NULL;
1215
1216 hval = rt6_exception_hash(daddr, saddr);
1217 *bucket += hval;
1218
1219 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1220 struct rt6_info *rt6 = rt6_ex->rt6i;
1221 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1222
1223#ifdef CONFIG_IPV6_SUBTREES
1224 if (matched && saddr)
1225 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1226#endif
1227 if (matched)
1228 return rt6_ex;
1229 }
1230 return NULL;
1231}
1232
1233/* Helper function to find the cached rt in the hash table
1234 * and update bucket pointer to point to the bucket for this
1235 * (daddr, saddr) pair
1236 * Caller must hold rcu_read_lock()
1237 */
1238static struct rt6_exception *
1239__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1240 const struct in6_addr *daddr,
1241 const struct in6_addr *saddr)
1242{
1243 struct rt6_exception *rt6_ex;
1244 u32 hval;
1245
1246 WARN_ON_ONCE(!rcu_read_lock_held());
1247
1248 if (!(*bucket) || !daddr)
1249 return NULL;
1250
1251 hval = rt6_exception_hash(daddr, saddr);
1252 *bucket += hval;
1253
1254 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1255 struct rt6_info *rt6 = rt6_ex->rt6i;
1256 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1257
1258#ifdef CONFIG_IPV6_SUBTREES
1259 if (matched && saddr)
1260 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1261#endif
1262 if (matched)
1263 return rt6_ex;
1264 }
1265 return NULL;
1266}
1267
1268static int rt6_insert_exception(struct rt6_info *nrt,
1269 struct rt6_info *ort)
1270{
81eb8447 1271 struct net *net = dev_net(ort->dst.dev);
35732d01
WW
1272 struct rt6_exception_bucket *bucket;
1273 struct in6_addr *src_key = NULL;
1274 struct rt6_exception *rt6_ex;
1275 int err = 0;
1276
1277 /* ort can't be a cache or pcpu route */
1278 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
3a2232e9 1279 ort = ort->from;
35732d01
WW
1280 WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1281
1282 spin_lock_bh(&rt6_exception_lock);
1283
1284 if (ort->exception_bucket_flushed) {
1285 err = -EINVAL;
1286 goto out;
1287 }
1288
1289 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1290 lockdep_is_held(&rt6_exception_lock));
1291 if (!bucket) {
1292 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1293 GFP_ATOMIC);
1294 if (!bucket) {
1295 err = -ENOMEM;
1296 goto out;
1297 }
1298 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1299 }
1300
1301#ifdef CONFIG_IPV6_SUBTREES
1302 /* rt6i_src.plen != 0 indicates ort is in subtree
1303 * and exception table is indexed by a hash of
1304 * both rt6i_dst and rt6i_src.
1305 * Otherwise, the exception table is indexed by
1306 * a hash of only rt6i_dst.
1307 */
1308 if (ort->rt6i_src.plen)
1309 src_key = &nrt->rt6i_src.addr;
1310#endif
60006a48
WW
1311
1312 /* Update rt6i_prefsrc as it could be changed
1313 * in rt6_remove_prefsrc()
1314 */
1315 nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
f5bbe7ee
WW
1316 /* rt6_mtu_change() might lower mtu on ort.
1317 * Only insert this exception route if its mtu
1318 * is less than ort's mtu value.
1319 */
1320 if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1321 err = -EINVAL;
1322 goto out;
1323 }
60006a48 1324
35732d01
WW
1325 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1326 src_key);
1327 if (rt6_ex)
1328 rt6_remove_exception(bucket, rt6_ex);
1329
1330 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1331 if (!rt6_ex) {
1332 err = -ENOMEM;
1333 goto out;
1334 }
1335 rt6_ex->rt6i = nrt;
1336 rt6_ex->stamp = jiffies;
1337 atomic_inc(&nrt->rt6i_ref);
1338 nrt->rt6i_node = ort->rt6i_node;
1339 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1340 bucket->depth++;
81eb8447 1341 net->ipv6.rt6_stats->fib_rt_cache++;
35732d01
WW
1342
1343 if (bucket->depth > FIB6_MAX_DEPTH)
1344 rt6_exception_remove_oldest(bucket);
1345
1346out:
1347 spin_unlock_bh(&rt6_exception_lock);
1348
1349 /* Update fn->fn_sernum to invalidate all cached dst */
b886d5f2 1350 if (!err) {
922c2ac8 1351 spin_lock_bh(&ort->rt6i_table->tb6_lock);
35732d01 1352 fib6_update_sernum(ort);
922c2ac8 1353 spin_unlock_bh(&ort->rt6i_table->tb6_lock);
b886d5f2
PA
1354 fib6_force_start_gc(net);
1355 }
35732d01
WW
1356
1357 return err;
1358}
1359
1360void rt6_flush_exceptions(struct rt6_info *rt)
1361{
1362 struct rt6_exception_bucket *bucket;
1363 struct rt6_exception *rt6_ex;
1364 struct hlist_node *tmp;
1365 int i;
1366
1367 spin_lock_bh(&rt6_exception_lock);
1368 /* Prevent rt6_insert_exception() to recreate the bucket list */
1369 rt->exception_bucket_flushed = 1;
1370
1371 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1372 lockdep_is_held(&rt6_exception_lock));
1373 if (!bucket)
1374 goto out;
1375
1376 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1377 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1378 rt6_remove_exception(bucket, rt6_ex);
1379 WARN_ON_ONCE(bucket->depth);
1380 bucket++;
1381 }
1382
1383out:
1384 spin_unlock_bh(&rt6_exception_lock);
1385}
1386
1387/* Find cached rt in the hash table inside passed in rt
1388 * Caller has to hold rcu_read_lock()
1389 */
1390static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1391 struct in6_addr *daddr,
1392 struct in6_addr *saddr)
1393{
1394 struct rt6_exception_bucket *bucket;
1395 struct in6_addr *src_key = NULL;
1396 struct rt6_exception *rt6_ex;
1397 struct rt6_info *res = NULL;
1398
1399 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1400
1401#ifdef CONFIG_IPV6_SUBTREES
1402 /* rt6i_src.plen != 0 indicates rt is in subtree
1403 * and exception table is indexed by a hash of
1404 * both rt6i_dst and rt6i_src.
1405 * Otherwise, the exception table is indexed by
1406 * a hash of only rt6i_dst.
1407 */
1408 if (rt->rt6i_src.plen)
1409 src_key = saddr;
1410#endif
1411 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1412
1413 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1414 res = rt6_ex->rt6i;
1415
1416 return res;
1417}
1418
1419/* Remove the passed in cached rt from the hash table that contains it */
1420int rt6_remove_exception_rt(struct rt6_info *rt)
1421{
35732d01 1422 struct rt6_exception_bucket *bucket;
3a2232e9 1423 struct rt6_info *from = rt->from;
35732d01
WW
1424 struct in6_addr *src_key = NULL;
1425 struct rt6_exception *rt6_ex;
1426 int err;
1427
1428 if (!from ||
442d713b 1429 !(rt->rt6i_flags & RTF_CACHE))
35732d01
WW
1430 return -EINVAL;
1431
1432 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1433 return -ENOENT;
1434
1435 spin_lock_bh(&rt6_exception_lock);
1436 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1437 lockdep_is_held(&rt6_exception_lock));
1438#ifdef CONFIG_IPV6_SUBTREES
1439 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1440 * and exception table is indexed by a hash of
1441 * both rt6i_dst and rt6i_src.
1442 * Otherwise, the exception table is indexed by
1443 * a hash of only rt6i_dst.
1444 */
1445 if (from->rt6i_src.plen)
1446 src_key = &rt->rt6i_src.addr;
1447#endif
1448 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1449 &rt->rt6i_dst.addr,
1450 src_key);
1451 if (rt6_ex) {
1452 rt6_remove_exception(bucket, rt6_ex);
1453 err = 0;
1454 } else {
1455 err = -ENOENT;
1456 }
1457
1458 spin_unlock_bh(&rt6_exception_lock);
1459 return err;
1460}
1461
1462/* Find rt6_ex which contains the passed in rt cache and
1463 * refresh its stamp
1464 */
1465static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1466{
35732d01 1467 struct rt6_exception_bucket *bucket;
3a2232e9 1468 struct rt6_info *from = rt->from;
35732d01
WW
1469 struct in6_addr *src_key = NULL;
1470 struct rt6_exception *rt6_ex;
1471
1472 if (!from ||
442d713b 1473 !(rt->rt6i_flags & RTF_CACHE))
35732d01
WW
1474 return;
1475
1476 rcu_read_lock();
1477 bucket = rcu_dereference(from->rt6i_exception_bucket);
1478
1479#ifdef CONFIG_IPV6_SUBTREES
1480 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1481 * and exception table is indexed by a hash of
1482 * both rt6i_dst and rt6i_src.
1483 * Otherwise, the exception table is indexed by
1484 * a hash of only rt6i_dst.
1485 */
1486 if (from->rt6i_src.plen)
1487 src_key = &rt->rt6i_src.addr;
1488#endif
1489 rt6_ex = __rt6_find_exception_rcu(&bucket,
1490 &rt->rt6i_dst.addr,
1491 src_key);
1492 if (rt6_ex)
1493 rt6_ex->stamp = jiffies;
1494
1495 rcu_read_unlock();
1496}
1497
60006a48
WW
1498static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1499{
1500 struct rt6_exception_bucket *bucket;
1501 struct rt6_exception *rt6_ex;
1502 int i;
1503
1504 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1505 lockdep_is_held(&rt6_exception_lock));
1506
1507 if (bucket) {
1508 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1509 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1510 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1511 }
1512 bucket++;
1513 }
1514 }
1515}
1516
e9fa1495
SB
1517static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1518 struct rt6_info *rt, int mtu)
1519{
1520 /* If the new MTU is lower than the route PMTU, this new MTU will be the
1521 * lowest MTU in the path: always allow updating the route PMTU to
1522 * reflect PMTU decreases.
1523 *
1524 * If the new MTU is higher, and the route PMTU is equal to the local
1525 * MTU, this means the old MTU is the lowest in the path, so allow
1526 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1527 * handle this.
1528 */
1529
1530 if (dst_mtu(&rt->dst) >= mtu)
1531 return true;
1532
1533 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1534 return true;
1535
1536 return false;
1537}
1538
1539static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1540 struct rt6_info *rt, int mtu)
f5bbe7ee
WW
1541{
1542 struct rt6_exception_bucket *bucket;
1543 struct rt6_exception *rt6_ex;
1544 int i;
1545
1546 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1547 lockdep_is_held(&rt6_exception_lock));
1548
e9fa1495
SB
1549 if (!bucket)
1550 return;
1551
1552 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1553 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1554 struct rt6_info *entry = rt6_ex->rt6i;
1555
1556 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1557 * route), the metrics of its rt->dst.from have already
1558 * been updated.
1559 */
1560 if (entry->rt6i_pmtu &&
1561 rt6_mtu_change_route_allowed(idev, entry, mtu))
1562 entry->rt6i_pmtu = mtu;
f5bbe7ee 1563 }
e9fa1495 1564 bucket++;
f5bbe7ee
WW
1565 }
1566}
1567
b16cb459
WW
1568#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
1569
1570static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1571 struct in6_addr *gateway)
1572{
1573 struct rt6_exception_bucket *bucket;
1574 struct rt6_exception *rt6_ex;
1575 struct hlist_node *tmp;
1576 int i;
1577
1578 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1579 return;
1580
1581 spin_lock_bh(&rt6_exception_lock);
1582 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1583 lockdep_is_held(&rt6_exception_lock));
1584
1585 if (bucket) {
1586 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1587 hlist_for_each_entry_safe(rt6_ex, tmp,
1588 &bucket->chain, hlist) {
1589 struct rt6_info *entry = rt6_ex->rt6i;
1590
1591 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1592 RTF_CACHE_GATEWAY &&
1593 ipv6_addr_equal(gateway,
1594 &entry->rt6i_gateway)) {
1595 rt6_remove_exception(bucket, rt6_ex);
1596 }
1597 }
1598 bucket++;
1599 }
1600 }
1601
1602 spin_unlock_bh(&rt6_exception_lock);
1603}
1604
c757faa8
WW
1605static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1606 struct rt6_exception *rt6_ex,
1607 struct fib6_gc_args *gc_args,
1608 unsigned long now)
1609{
1610 struct rt6_info *rt = rt6_ex->rt6i;
1611
1859bac0
PA
1612 /* we are pruning and obsoleting aged-out and non gateway exceptions
1613 * even if others have still references to them, so that on next
1614 * dst_check() such references can be dropped.
1615 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1616 * expired, independently from their aging, as per RFC 8201 section 4
1617 */
31afeb42
WW
1618 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1619 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1620 RT6_TRACE("aging clone %p\n", rt);
1621 rt6_remove_exception(bucket, rt6_ex);
1622 return;
1623 }
1624 } else if (time_after(jiffies, rt->dst.expires)) {
1625 RT6_TRACE("purging expired route %p\n", rt);
c757faa8
WW
1626 rt6_remove_exception(bucket, rt6_ex);
1627 return;
31afeb42
WW
1628 }
1629
1630 if (rt->rt6i_flags & RTF_GATEWAY) {
c757faa8
WW
1631 struct neighbour *neigh;
1632 __u8 neigh_flags = 0;
1633
1634 neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1635 if (neigh) {
1636 neigh_flags = neigh->flags;
1637 neigh_release(neigh);
1638 }
1639 if (!(neigh_flags & NTF_ROUTER)) {
1640 RT6_TRACE("purging route %p via non-router but gateway\n",
1641 rt);
1642 rt6_remove_exception(bucket, rt6_ex);
1643 return;
1644 }
1645 }
31afeb42 1646
c757faa8
WW
1647 gc_args->more++;
1648}
1649
1650void rt6_age_exceptions(struct rt6_info *rt,
1651 struct fib6_gc_args *gc_args,
1652 unsigned long now)
1653{
1654 struct rt6_exception_bucket *bucket;
1655 struct rt6_exception *rt6_ex;
1656 struct hlist_node *tmp;
1657 int i;
1658
1659 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1660 return;
1661
1662 spin_lock_bh(&rt6_exception_lock);
1663 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1664 lockdep_is_held(&rt6_exception_lock));
1665
1666 if (bucket) {
1667 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1668 hlist_for_each_entry_safe(rt6_ex, tmp,
1669 &bucket->chain, hlist) {
1670 rt6_age_examine_exception(bucket, rt6_ex,
1671 gc_args, now);
1672 }
1673 bucket++;
1674 }
1675 }
1676 spin_unlock_bh(&rt6_exception_lock);
1677}
1678
9ff74384 1679struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
b75cc8f9
DA
1680 int oif, struct flowi6 *fl6,
1681 const struct sk_buff *skb, int flags)
1da177e4 1682{
367efcb9 1683 struct fib6_node *fn, *saved_fn;
2b760fcf 1684 struct rt6_info *rt, *rt_cache;
c71099ac 1685 int strict = 0;
1da177e4 1686
77d16f45 1687 strict |= flags & RT6_LOOKUP_F_IFACE;
d5d32e4b 1688 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
367efcb9
MKL
1689 if (net->ipv6.devconf_all->forwarding == 0)
1690 strict |= RT6_LOOKUP_F_REACHABLE;
1da177e4 1691
66f5d6ce 1692 rcu_read_lock();
1da177e4 1693
4c9483b2 1694 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
367efcb9 1695 saved_fn = fn;
1da177e4 1696
ca254490
DA
1697 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1698 oif = 0;
1699
a3c00e46 1700redo_rt6_select:
8d1040e8 1701 rt = rt6_select(net, fn, oif, strict);
52bd4c0c 1702 if (rt->rt6i_nsiblings)
b4bac172 1703 rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
a3c00e46
MKL
1704 if (rt == net->ipv6.ip6_null_entry) {
1705 fn = fib6_backtrack(fn, &fl6->saddr);
1706 if (fn)
1707 goto redo_rt6_select;
367efcb9
MKL
1708 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1709 /* also consider unreachable route */
1710 strict &= ~RT6_LOOKUP_F_REACHABLE;
1711 fn = saved_fn;
1712 goto redo_rt6_select;
367efcb9 1713 }
a3c00e46
MKL
1714 }
1715
2b760fcf
WW
1716 /*Search through exception table */
1717 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1718 if (rt_cache)
1719 rt = rt_cache;
fb9de91e 1720
d3843fe5 1721 if (rt == net->ipv6.ip6_null_entry) {
66f5d6ce 1722 rcu_read_unlock();
d3843fe5 1723 dst_hold(&rt->dst);
b65f164d 1724 trace_fib6_table_lookup(net, rt, table, fl6);
d3843fe5
WW
1725 return rt;
1726 } else if (rt->rt6i_flags & RTF_CACHE) {
1727 if (ip6_hold_safe(net, &rt, true)) {
1728 dst_use_noref(&rt->dst, jiffies);
1729 rt6_dst_from_metrics_check(rt);
1730 }
66f5d6ce 1731 rcu_read_unlock();
b65f164d 1732 trace_fib6_table_lookup(net, rt, table, fl6);
d52d3997 1733 return rt;
3da59bd9
MKL
1734 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1735 !(rt->rt6i_flags & RTF_GATEWAY))) {
1736 /* Create a RTF_CACHE clone which will not be
1737 * owned by the fib6 tree. It is for the special case where
1738 * the daddr in the skb during the neighbor look-up is different
1739 * from the fl6->daddr used to look-up route here.
1740 */
1741
1742 struct rt6_info *uncached_rt;
1743
d3843fe5
WW
1744 if (ip6_hold_safe(net, &rt, true)) {
1745 dst_use_noref(&rt->dst, jiffies);
1746 } else {
66f5d6ce 1747 rcu_read_unlock();
d3843fe5
WW
1748 uncached_rt = rt;
1749 goto uncached_rt_out;
1750 }
66f5d6ce 1751 rcu_read_unlock();
d52d3997 1752
3da59bd9
MKL
1753 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1754 dst_release(&rt->dst);
c71099ac 1755
1cfb71ee
WW
1756 if (uncached_rt) {
1757 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1758 * No need for another dst_hold()
1759 */
8d0b94af 1760 rt6_uncached_list_add(uncached_rt);
81eb8447 1761 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1cfb71ee 1762 } else {
3da59bd9 1763 uncached_rt = net->ipv6.ip6_null_entry;
1cfb71ee
WW
1764 dst_hold(&uncached_rt->dst);
1765 }
b811580d 1766
d3843fe5 1767uncached_rt_out:
b65f164d 1768 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
3da59bd9 1769 return uncached_rt;
3da59bd9 1770
d52d3997
MKL
1771 } else {
1772 /* Get a percpu copy */
1773
1774 struct rt6_info *pcpu_rt;
1775
d3843fe5 1776 dst_use_noref(&rt->dst, jiffies);
951f788a 1777 local_bh_disable();
d52d3997 1778 pcpu_rt = rt6_get_pcpu_route(rt);
d52d3997 1779
951f788a 1780 if (!pcpu_rt) {
a94b9367
WW
1781 /* atomic_inc_not_zero() is needed when using rcu */
1782 if (atomic_inc_not_zero(&rt->rt6i_ref)) {
951f788a 1783 /* No dst_hold() on rt is needed because grabbing
a94b9367
WW
1784 * rt->rt6i_ref makes sure rt can't be released.
1785 */
a94b9367
WW
1786 pcpu_rt = rt6_make_pcpu_route(rt);
1787 rt6_release(rt);
1788 } else {
1789 /* rt is already removed from tree */
a94b9367
WW
1790 pcpu_rt = net->ipv6.ip6_null_entry;
1791 dst_hold(&pcpu_rt->dst);
1792 }
9c7370a1 1793 }
951f788a
ED
1794 local_bh_enable();
1795 rcu_read_unlock();
b65f164d 1796 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
d52d3997
MKL
1797 return pcpu_rt;
1798 }
1da177e4 1799}
9ff74384 1800EXPORT_SYMBOL_GPL(ip6_pol_route);
1da177e4 1801
b75cc8f9
DA
1802static struct rt6_info *ip6_pol_route_input(struct net *net,
1803 struct fib6_table *table,
1804 struct flowi6 *fl6,
1805 const struct sk_buff *skb,
1806 int flags)
4acad72d 1807{
b75cc8f9 1808 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
4acad72d
PE
1809}
1810
d409b847
MB
1811struct dst_entry *ip6_route_input_lookup(struct net *net,
1812 struct net_device *dev,
b75cc8f9
DA
1813 struct flowi6 *fl6,
1814 const struct sk_buff *skb,
1815 int flags)
72331bc0
SL
1816{
1817 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1818 flags |= RT6_LOOKUP_F_IFACE;
1819
b75cc8f9 1820 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
72331bc0 1821}
d409b847 1822EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
72331bc0 1823
23aebdac 1824static void ip6_multipath_l3_keys(const struct sk_buff *skb,
5e5d6fed
RP
1825 struct flow_keys *keys,
1826 struct flow_keys *flkeys)
23aebdac
JS
1827{
1828 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1829 const struct ipv6hdr *key_iph = outer_iph;
5e5d6fed 1830 struct flow_keys *_flkeys = flkeys;
23aebdac
JS
1831 const struct ipv6hdr *inner_iph;
1832 const struct icmp6hdr *icmph;
1833 struct ipv6hdr _inner_iph;
1834
1835 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1836 goto out;
1837
1838 icmph = icmp6_hdr(skb);
1839 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1840 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1841 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1842 icmph->icmp6_type != ICMPV6_PARAMPROB)
1843 goto out;
1844
1845 inner_iph = skb_header_pointer(skb,
1846 skb_transport_offset(skb) + sizeof(*icmph),
1847 sizeof(_inner_iph), &_inner_iph);
1848 if (!inner_iph)
1849 goto out;
1850
1851 key_iph = inner_iph;
5e5d6fed 1852 _flkeys = NULL;
23aebdac 1853out:
5e5d6fed
RP
1854 if (_flkeys) {
1855 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1856 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1857 keys->tags.flow_label = _flkeys->tags.flow_label;
1858 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1859 } else {
1860 keys->addrs.v6addrs.src = key_iph->saddr;
1861 keys->addrs.v6addrs.dst = key_iph->daddr;
1862 keys->tags.flow_label = ip6_flowinfo(key_iph);
1863 keys->basic.ip_proto = key_iph->nexthdr;
1864 }
23aebdac
JS
1865}
1866
1867/* if skb is set it will be used and fl6 can be NULL */
b4bac172
DA
1868u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1869 const struct sk_buff *skb, struct flow_keys *flkeys)
23aebdac
JS
1870{
1871 struct flow_keys hash_keys;
9a2a537a 1872 u32 mhash;
23aebdac 1873
bbfa047a 1874 switch (ip6_multipath_hash_policy(net)) {
b4bac172
DA
1875 case 0:
1876 memset(&hash_keys, 0, sizeof(hash_keys));
1877 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1878 if (skb) {
1879 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1880 } else {
1881 hash_keys.addrs.v6addrs.src = fl6->saddr;
1882 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1883 hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1884 hash_keys.basic.ip_proto = fl6->flowi6_proto;
1885 }
1886 break;
1887 case 1:
1888 if (skb) {
1889 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1890 struct flow_keys keys;
1891
1892 /* short-circuit if we already have L4 hash present */
1893 if (skb->l4_hash)
1894 return skb_get_hash_raw(skb) >> 1;
1895
1896 memset(&hash_keys, 0, sizeof(hash_keys));
1897
1898 if (!flkeys) {
1899 skb_flow_dissect_flow_keys(skb, &keys, flag);
1900 flkeys = &keys;
1901 }
1902 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1903 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
1904 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
1905 hash_keys.ports.src = flkeys->ports.src;
1906 hash_keys.ports.dst = flkeys->ports.dst;
1907 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1908 } else {
1909 memset(&hash_keys, 0, sizeof(hash_keys));
1910 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1911 hash_keys.addrs.v6addrs.src = fl6->saddr;
1912 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1913 hash_keys.ports.src = fl6->fl6_sport;
1914 hash_keys.ports.dst = fl6->fl6_dport;
1915 hash_keys.basic.ip_proto = fl6->flowi6_proto;
1916 }
1917 break;
23aebdac 1918 }
9a2a537a 1919 mhash = flow_hash_from_keys(&hash_keys);
23aebdac 1920
9a2a537a 1921 return mhash >> 1;
23aebdac
JS
1922}
1923
c71099ac
TG
1924void ip6_route_input(struct sk_buff *skb)
1925{
b71d1d42 1926 const struct ipv6hdr *iph = ipv6_hdr(skb);
c346dca1 1927 struct net *net = dev_net(skb->dev);
adaa70bb 1928 int flags = RT6_LOOKUP_F_HAS_SADDR;
904af04d 1929 struct ip_tunnel_info *tun_info;
4c9483b2 1930 struct flowi6 fl6 = {
e0d56fdd 1931 .flowi6_iif = skb->dev->ifindex,
4c9483b2
DM
1932 .daddr = iph->daddr,
1933 .saddr = iph->saddr,
6502ca52 1934 .flowlabel = ip6_flowinfo(iph),
4c9483b2
DM
1935 .flowi6_mark = skb->mark,
1936 .flowi6_proto = iph->nexthdr,
c71099ac 1937 };
5e5d6fed 1938 struct flow_keys *flkeys = NULL, _flkeys;
adaa70bb 1939
904af04d 1940 tun_info = skb_tunnel_info(skb);
46fa062a 1941 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
904af04d 1942 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
5e5d6fed
RP
1943
1944 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
1945 flkeys = &_flkeys;
1946
23aebdac 1947 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
b4bac172 1948 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
06e9d040 1949 skb_dst_drop(skb);
b75cc8f9
DA
1950 skb_dst_set(skb,
1951 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
c71099ac
TG
1952}
1953
b75cc8f9
DA
1954static struct rt6_info *ip6_pol_route_output(struct net *net,
1955 struct fib6_table *table,
1956 struct flowi6 *fl6,
1957 const struct sk_buff *skb,
1958 int flags)
1da177e4 1959{
b75cc8f9 1960 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
c71099ac
TG
1961}
1962
6f21c96a
PA
1963struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1964 struct flowi6 *fl6, int flags)
c71099ac 1965{
d46a9d67 1966 bool any_src;
c71099ac 1967
4c1feac5
DA
1968 if (rt6_need_strict(&fl6->daddr)) {
1969 struct dst_entry *dst;
1970
1971 dst = l3mdev_link_scope_lookup(net, fl6);
1972 if (dst)
1973 return dst;
1974 }
ca254490 1975
1fb9489b 1976 fl6->flowi6_iif = LOOPBACK_IFINDEX;
4dc27d1c 1977
d46a9d67 1978 any_src = ipv6_addr_any(&fl6->saddr);
741a11d9 1979 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
d46a9d67 1980 (fl6->flowi6_oif && any_src))
77d16f45 1981 flags |= RT6_LOOKUP_F_IFACE;
c71099ac 1982
d46a9d67 1983 if (!any_src)
adaa70bb 1984 flags |= RT6_LOOKUP_F_HAS_SADDR;
0c9a2ac1
YH
1985 else if (sk)
1986 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
adaa70bb 1987
b75cc8f9 1988 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
1da177e4 1989}
6f21c96a 1990EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1da177e4 1991
2774c131 1992struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 1993{
5c1e6aa3 1994 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1dbe3252 1995 struct net_device *loopback_dev = net->loopback_dev;
14e50e57
DM
1996 struct dst_entry *new = NULL;
1997
1dbe3252 1998 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
62cf27e5 1999 DST_OBSOLETE_DEAD, 0);
14e50e57 2000 if (rt) {
0a1f5962 2001 rt6_info_init(rt);
81eb8447 2002 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
8104891b 2003
0a1f5962 2004 new = &rt->dst;
14e50e57 2005 new->__use = 1;
352e512c 2006 new->input = dst_discard;
ede2059d 2007 new->output = dst_discard_out;
14e50e57 2008
0a1f5962 2009 dst_copy_metrics(new, &ort->dst);
14e50e57 2010
1dbe3252 2011 rt->rt6i_idev = in6_dev_get(loopback_dev);
4e3fd7a0 2012 rt->rt6i_gateway = ort->rt6i_gateway;
0a1f5962 2013 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
14e50e57
DM
2014 rt->rt6i_metric = 0;
2015
2016 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2017#ifdef CONFIG_IPV6_SUBTREES
2018 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2019#endif
14e50e57
DM
2020 }
2021
69ead7af
DM
2022 dst_release(dst_orig);
2023 return new ? new : ERR_PTR(-ENOMEM);
14e50e57 2024}
14e50e57 2025
1da177e4
LT
2026/*
2027 * Destination cache support functions
2028 */
2029
4b32b5ad
MKL
2030static void rt6_dst_from_metrics_check(struct rt6_info *rt)
2031{
3a2232e9
DM
2032 if (rt->from &&
2033 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
2034 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
4b32b5ad
MKL
2035}
2036
3da59bd9
MKL
2037static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2038{
36143645 2039 u32 rt_cookie = 0;
c5cff856
WW
2040
2041 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
3da59bd9
MKL
2042 return NULL;
2043
2044 if (rt6_check_expired(rt))
2045 return NULL;
2046
2047 return &rt->dst;
2048}
2049
2050static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2051{
5973fb1e
MKL
2052 if (!__rt6_check_expired(rt) &&
2053 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
3a2232e9 2054 rt6_check(rt->from, cookie))
3da59bd9
MKL
2055 return &rt->dst;
2056 else
2057 return NULL;
2058}
2059
1da177e4
LT
2060static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2061{
2062 struct rt6_info *rt;
2063
2064 rt = (struct rt6_info *) dst;
2065
6f3118b5
ND
2066 /* All IPV6 dsts are created with ->obsolete set to the value
2067 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2068 * into this function always.
2069 */
e3bc10bd 2070
4b32b5ad
MKL
2071 rt6_dst_from_metrics_check(rt);
2072
02bcf4e0 2073 if (rt->rt6i_flags & RTF_PCPU ||
3a2232e9 2074 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
3da59bd9
MKL
2075 return rt6_dst_from_check(rt, cookie);
2076 else
2077 return rt6_check(rt, cookie);
1da177e4
LT
2078}
2079
2080static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2081{
2082 struct rt6_info *rt = (struct rt6_info *) dst;
2083
2084 if (rt) {
54c1a859
YH
2085 if (rt->rt6i_flags & RTF_CACHE) {
2086 if (rt6_check_expired(rt)) {
2087 ip6_del_rt(rt);
2088 dst = NULL;
2089 }
2090 } else {
1da177e4 2091 dst_release(dst);
54c1a859
YH
2092 dst = NULL;
2093 }
1da177e4 2094 }
54c1a859 2095 return dst;
1da177e4
LT
2096}
2097
2098static void ip6_link_failure(struct sk_buff *skb)
2099{
2100 struct rt6_info *rt;
2101
3ffe533c 2102 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1da177e4 2103
adf30907 2104 rt = (struct rt6_info *) skb_dst(skb);
1da177e4 2105 if (rt) {
1eb4f758 2106 if (rt->rt6i_flags & RTF_CACHE) {
ad65a2f0
WW
2107 if (dst_hold_safe(&rt->dst))
2108 ip6_del_rt(rt);
c5cff856
WW
2109 } else {
2110 struct fib6_node *fn;
2111
2112 rcu_read_lock();
2113 fn = rcu_dereference(rt->rt6i_node);
2114 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2115 fn->fn_sernum = -1;
2116 rcu_read_unlock();
1eb4f758 2117 }
1da177e4
LT
2118 }
2119}
2120
45e4fd26
MKL
2121static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2122{
2123 struct net *net = dev_net(rt->dst.dev);
2124
2125 rt->rt6i_flags |= RTF_MODIFIED;
2126 rt->rt6i_pmtu = mtu;
2127 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2128}
2129
0d3f6d29
MKL
2130static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2131{
2132 return !(rt->rt6i_flags & RTF_CACHE) &&
4e587ea7
WW
2133 (rt->rt6i_flags & RTF_PCPU ||
2134 rcu_access_pointer(rt->rt6i_node));
0d3f6d29
MKL
2135}
2136
45e4fd26
MKL
2137static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2138 const struct ipv6hdr *iph, u32 mtu)
1da177e4 2139{
0dec879f 2140 const struct in6_addr *daddr, *saddr;
67ba4152 2141 struct rt6_info *rt6 = (struct rt6_info *)dst;
1da177e4 2142
45e4fd26
MKL
2143 if (rt6->rt6i_flags & RTF_LOCAL)
2144 return;
81aded24 2145
19bda36c
XL
2146 if (dst_metric_locked(dst, RTAX_MTU))
2147 return;
2148
0dec879f
JA
2149 if (iph) {
2150 daddr = &iph->daddr;
2151 saddr = &iph->saddr;
2152 } else if (sk) {
2153 daddr = &sk->sk_v6_daddr;
2154 saddr = &inet6_sk(sk)->saddr;
2155 } else {
2156 daddr = NULL;
2157 saddr = NULL;
2158 }
2159 dst_confirm_neigh(dst, daddr);
45e4fd26
MKL
2160 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2161 if (mtu >= dst_mtu(dst))
2162 return;
9d289715 2163
0d3f6d29 2164 if (!rt6_cache_allowed_for_pmtu(rt6)) {
45e4fd26 2165 rt6_do_update_pmtu(rt6, mtu);
2b760fcf
WW
2166 /* update rt6_ex->stamp for cache */
2167 if (rt6->rt6i_flags & RTF_CACHE)
2168 rt6_update_exception_stamp_rt(rt6);
0dec879f 2169 } else if (daddr) {
45e4fd26
MKL
2170 struct rt6_info *nrt6;
2171
45e4fd26
MKL
2172 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2173 if (nrt6) {
2174 rt6_do_update_pmtu(nrt6, mtu);
2b760fcf
WW
2175 if (rt6_insert_exception(nrt6, rt6))
2176 dst_release_immediate(&nrt6->dst);
45e4fd26 2177 }
1da177e4
LT
2178 }
2179}
2180
45e4fd26
MKL
2181static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2182 struct sk_buff *skb, u32 mtu)
2183{
2184 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2185}
2186
42ae66c8 2187void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
e2d118a1 2188 int oif, u32 mark, kuid_t uid)
81aded24
DM
2189{
2190 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2191 struct dst_entry *dst;
2192 struct flowi6 fl6;
2193
2194 memset(&fl6, 0, sizeof(fl6));
2195 fl6.flowi6_oif = oif;
1b3c61dc 2196 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
81aded24
DM
2197 fl6.daddr = iph->daddr;
2198 fl6.saddr = iph->saddr;
6502ca52 2199 fl6.flowlabel = ip6_flowinfo(iph);
e2d118a1 2200 fl6.flowi6_uid = uid;
81aded24
DM
2201
2202 dst = ip6_route_output(net, NULL, &fl6);
2203 if (!dst->error)
45e4fd26 2204 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
81aded24
DM
2205 dst_release(dst);
2206}
2207EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2208
2209void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2210{
33c162a9
MKL
2211 struct dst_entry *dst;
2212
81aded24 2213 ip6_update_pmtu(skb, sock_net(sk), mtu,
e2d118a1 2214 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
33c162a9
MKL
2215
2216 dst = __sk_dst_get(sk);
2217 if (!dst || !dst->obsolete ||
2218 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2219 return;
2220
2221 bh_lock_sock(sk);
2222 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2223 ip6_datagram_dst_update(sk, false);
2224 bh_unlock_sock(sk);
81aded24
DM
2225}
2226EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2227
b55b76b2
DJ
2228/* Handle redirects */
2229struct ip6rd_flowi {
2230 struct flowi6 fl6;
2231 struct in6_addr gateway;
2232};
2233
2234static struct rt6_info *__ip6_route_redirect(struct net *net,
2235 struct fib6_table *table,
2236 struct flowi6 *fl6,
b75cc8f9 2237 const struct sk_buff *skb,
b55b76b2
DJ
2238 int flags)
2239{
2240 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2b760fcf 2241 struct rt6_info *rt, *rt_cache;
b55b76b2
DJ
2242 struct fib6_node *fn;
2243
2244 /* Get the "current" route for this destination and
67c408cf 2245 * check if the redirect has come from appropriate router.
b55b76b2
DJ
2246 *
2247 * RFC 4861 specifies that redirects should only be
2248 * accepted if they come from the nexthop to the target.
2249 * Due to the way the routes are chosen, this notion
2250 * is a bit fuzzy and one might need to check all possible
2251 * routes.
2252 */
2253
66f5d6ce 2254 rcu_read_lock();
b55b76b2
DJ
2255 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2256restart:
66f5d6ce 2257 for_each_fib6_node_rt_rcu(fn) {
8067bb8c
IS
2258 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2259 continue;
b55b76b2
DJ
2260 if (rt6_check_expired(rt))
2261 continue;
2262 if (rt->dst.error)
2263 break;
2264 if (!(rt->rt6i_flags & RTF_GATEWAY))
2265 continue;
2266 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2267 continue;
2b760fcf
WW
2268 /* rt_cache's gateway might be different from its 'parent'
2269 * in the case of an ip redirect.
2270 * So we keep searching in the exception table if the gateway
2271 * is different.
2272 */
2273 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2274 rt_cache = rt6_find_cached_rt(rt,
2275 &fl6->daddr,
2276 &fl6->saddr);
2277 if (rt_cache &&
2278 ipv6_addr_equal(&rdfl->gateway,
2279 &rt_cache->rt6i_gateway)) {
2280 rt = rt_cache;
2281 break;
2282 }
b55b76b2 2283 continue;
2b760fcf 2284 }
b55b76b2
DJ
2285 break;
2286 }
2287
2288 if (!rt)
2289 rt = net->ipv6.ip6_null_entry;
2290 else if (rt->dst.error) {
2291 rt = net->ipv6.ip6_null_entry;
b0a1ba59
MKL
2292 goto out;
2293 }
2294
2295 if (rt == net->ipv6.ip6_null_entry) {
a3c00e46
MKL
2296 fn = fib6_backtrack(fn, &fl6->saddr);
2297 if (fn)
2298 goto restart;
b55b76b2 2299 }
a3c00e46 2300
b0a1ba59 2301out:
d3843fe5 2302 ip6_hold_safe(net, &rt, true);
b55b76b2 2303
66f5d6ce 2304 rcu_read_unlock();
b55b76b2 2305
b65f164d 2306 trace_fib6_table_lookup(net, rt, table, fl6);
b55b76b2
DJ
2307 return rt;
2308};
2309
2310static struct dst_entry *ip6_route_redirect(struct net *net,
b75cc8f9
DA
2311 const struct flowi6 *fl6,
2312 const struct sk_buff *skb,
2313 const struct in6_addr *gateway)
b55b76b2
DJ
2314{
2315 int flags = RT6_LOOKUP_F_HAS_SADDR;
2316 struct ip6rd_flowi rdfl;
2317
2318 rdfl.fl6 = *fl6;
2319 rdfl.gateway = *gateway;
2320
b75cc8f9 2321 return fib6_rule_lookup(net, &rdfl.fl6, skb,
b55b76b2
DJ
2322 flags, __ip6_route_redirect);
2323}
2324
e2d118a1
LC
2325void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2326 kuid_t uid)
3a5ad2ee
DM
2327{
2328 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2329 struct dst_entry *dst;
2330 struct flowi6 fl6;
2331
2332 memset(&fl6, 0, sizeof(fl6));
e374c618 2333 fl6.flowi6_iif = LOOPBACK_IFINDEX;
3a5ad2ee
DM
2334 fl6.flowi6_oif = oif;
2335 fl6.flowi6_mark = mark;
3a5ad2ee
DM
2336 fl6.daddr = iph->daddr;
2337 fl6.saddr = iph->saddr;
6502ca52 2338 fl6.flowlabel = ip6_flowinfo(iph);
e2d118a1 2339 fl6.flowi6_uid = uid;
3a5ad2ee 2340
b75cc8f9 2341 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
b55b76b2 2342 rt6_do_redirect(dst, NULL, skb);
3a5ad2ee
DM
2343 dst_release(dst);
2344}
2345EXPORT_SYMBOL_GPL(ip6_redirect);
2346
c92a59ec
DJ
2347void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2348 u32 mark)
2349{
2350 const struct ipv6hdr *iph = ipv6_hdr(skb);
2351 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2352 struct dst_entry *dst;
2353 struct flowi6 fl6;
2354
2355 memset(&fl6, 0, sizeof(fl6));
e374c618 2356 fl6.flowi6_iif = LOOPBACK_IFINDEX;
c92a59ec
DJ
2357 fl6.flowi6_oif = oif;
2358 fl6.flowi6_mark = mark;
c92a59ec
DJ
2359 fl6.daddr = msg->dest;
2360 fl6.saddr = iph->daddr;
e2d118a1 2361 fl6.flowi6_uid = sock_net_uid(net, NULL);
c92a59ec 2362
b75cc8f9 2363 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
b55b76b2 2364 rt6_do_redirect(dst, NULL, skb);
c92a59ec
DJ
2365 dst_release(dst);
2366}
2367
3a5ad2ee
DM
2368void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2369{
e2d118a1
LC
2370 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2371 sk->sk_uid);
3a5ad2ee
DM
2372}
2373EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2374
0dbaee3b 2375static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1da177e4 2376{
0dbaee3b
DM
2377 struct net_device *dev = dst->dev;
2378 unsigned int mtu = dst_mtu(dst);
2379 struct net *net = dev_net(dev);
2380
1da177e4
LT
2381 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2382
5578689a
DL
2383 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2384 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1da177e4
LT
2385
2386 /*
1ab1457c
YH
2387 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2388 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2389 * IPV6_MAXPLEN is also valid and means: "any MSS,
1da177e4
LT
2390 * rely only on pmtu discovery"
2391 */
2392 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2393 mtu = IPV6_MAXPLEN;
2394 return mtu;
2395}
2396
ebb762f2 2397static unsigned int ip6_mtu(const struct dst_entry *dst)
d33e4553 2398{
4b32b5ad
MKL
2399 const struct rt6_info *rt = (const struct rt6_info *)dst;
2400 unsigned int mtu = rt->rt6i_pmtu;
d33e4553 2401 struct inet6_dev *idev;
618f9bc7 2402
4b32b5ad
MKL
2403 if (mtu)
2404 goto out;
2405
2406 mtu = dst_metric_raw(dst, RTAX_MTU);
618f9bc7 2407 if (mtu)
30f78d8e 2408 goto out;
618f9bc7
SK
2409
2410 mtu = IPV6_MIN_MTU;
d33e4553
DM
2411
2412 rcu_read_lock();
2413 idev = __in6_dev_get(dst->dev);
2414 if (idev)
2415 mtu = idev->cnf.mtu6;
2416 rcu_read_unlock();
2417
30f78d8e 2418out:
14972cbd
RP
2419 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2420
2421 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
d33e4553
DM
2422}
2423
3b00944c 2424struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
87a11578 2425 struct flowi6 *fl6)
1da177e4 2426{
87a11578 2427 struct dst_entry *dst;
1da177e4
LT
2428 struct rt6_info *rt;
2429 struct inet6_dev *idev = in6_dev_get(dev);
c346dca1 2430 struct net *net = dev_net(dev);
1da177e4 2431
38308473 2432 if (unlikely(!idev))
122bdf67 2433 return ERR_PTR(-ENODEV);
1da177e4 2434
ad706862 2435 rt = ip6_dst_alloc(net, dev, 0);
38308473 2436 if (unlikely(!rt)) {
1da177e4 2437 in6_dev_put(idev);
87a11578 2438 dst = ERR_PTR(-ENOMEM);
1da177e4
LT
2439 goto out;
2440 }
2441
8e2ec639 2442 rt->dst.flags |= DST_HOST;
588753f1 2443 rt->dst.input = ip6_input;
8e2ec639 2444 rt->dst.output = ip6_output;
550bab42 2445 rt->rt6i_gateway = fl6->daddr;
87a11578 2446 rt->rt6i_dst.addr = fl6->daddr;
8e2ec639
YZ
2447 rt->rt6i_dst.plen = 128;
2448 rt->rt6i_idev = idev;
14edd87d 2449 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1da177e4 2450
4c981e28 2451 /* Add this dst into uncached_list so that rt6_disable_ip() can
587fea74
WW
2452 * do proper release of the net_device
2453 */
2454 rt6_uncached_list_add(rt);
81eb8447 2455 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1da177e4 2456
87a11578
DM
2457 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2458
1da177e4 2459out:
87a11578 2460 return dst;
1da177e4
LT
2461}
2462
569d3645 2463static int ip6_dst_gc(struct dst_ops *ops)
1da177e4 2464{
86393e52 2465 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
7019b78e
DL
2466 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2467 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2468 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2469 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2470 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
fc66f95c 2471 int entries;
7019b78e 2472
fc66f95c 2473 entries = dst_entries_get_fast(ops);
49a18d86 2474 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
fc66f95c 2475 entries <= rt_max_size)
1da177e4
LT
2476 goto out;
2477
6891a346 2478 net->ipv6.ip6_rt_gc_expire++;
14956643 2479 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
fc66f95c
ED
2480 entries = dst_entries_get_slow(ops);
2481 if (entries < ops->gc_thresh)
7019b78e 2482 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1da177e4 2483out:
7019b78e 2484 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
fc66f95c 2485 return entries > rt_max_size;
1da177e4
LT
2486}
2487
e715b6d3
FW
2488static int ip6_convert_metrics(struct mx6_config *mxc,
2489 const struct fib6_config *cfg)
2490{
6670e152 2491 struct net *net = cfg->fc_nlinfo.nl_net;
c3a8d947 2492 bool ecn_ca = false;
e715b6d3
FW
2493 struct nlattr *nla;
2494 int remaining;
2495 u32 *mp;
2496
63159f29 2497 if (!cfg->fc_mx)
e715b6d3
FW
2498 return 0;
2499
2500 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2501 if (unlikely(!mp))
2502 return -ENOMEM;
2503
2504 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2505 int type = nla_type(nla);
1bb14807 2506 u32 val;
e715b6d3 2507
1bb14807
DB
2508 if (!type)
2509 continue;
2510 if (unlikely(type > RTAX_MAX))
2511 goto err;
ea697639 2512
1bb14807
DB
2513 if (type == RTAX_CC_ALGO) {
2514 char tmp[TCP_CA_NAME_MAX];
e715b6d3 2515
1bb14807 2516 nla_strlcpy(tmp, nla, sizeof(tmp));
6670e152 2517 val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
1bb14807
DB
2518 if (val == TCP_CA_UNSPEC)
2519 goto err;
2520 } else {
2521 val = nla_get_u32(nla);
e715b6d3 2522 }
626abd59
PA
2523 if (type == RTAX_HOPLIMIT && val > 255)
2524 val = 255;
b8d3e416
DB
2525 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2526 goto err;
1bb14807
DB
2527
2528 mp[type - 1] = val;
2529 __set_bit(type - 1, mxc->mx_valid);
e715b6d3
FW
2530 }
2531
c3a8d947
DB
2532 if (ecn_ca) {
2533 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2534 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2535 }
e715b6d3 2536
c3a8d947 2537 mxc->mx = mp;
e715b6d3
FW
2538 return 0;
2539 err:
2540 kfree(mp);
2541 return -EINVAL;
2542}
1da177e4 2543
8c14586f
DA
2544static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2545 struct fib6_config *cfg,
f4797b33
DA
2546 const struct in6_addr *gw_addr,
2547 u32 tbid, int flags)
8c14586f
DA
2548{
2549 struct flowi6 fl6 = {
2550 .flowi6_oif = cfg->fc_ifindex,
2551 .daddr = *gw_addr,
2552 .saddr = cfg->fc_prefsrc,
2553 };
2554 struct fib6_table *table;
2555 struct rt6_info *rt;
8c14586f 2556
f4797b33 2557 table = fib6_get_table(net, tbid);
8c14586f
DA
2558 if (!table)
2559 return NULL;
2560
2561 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2562 flags |= RT6_LOOKUP_F_HAS_SADDR;
2563
f4797b33 2564 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
b75cc8f9 2565 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
8c14586f
DA
2566
2567 /* if table lookup failed, fall back to full lookup */
2568 if (rt == net->ipv6.ip6_null_entry) {
2569 ip6_rt_put(rt);
2570 rt = NULL;
2571 }
2572
2573 return rt;
2574}
2575
fc1e64e1
DA
2576static int ip6_route_check_nh_onlink(struct net *net,
2577 struct fib6_config *cfg,
9fbb704c 2578 const struct net_device *dev,
fc1e64e1
DA
2579 struct netlink_ext_ack *extack)
2580{
44750f84 2581 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
fc1e64e1
DA
2582 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2583 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2584 struct rt6_info *grt;
2585 int err;
2586
2587 err = 0;
2588 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2589 if (grt) {
58e354c0
DA
2590 if (!grt->dst.error &&
2591 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
44750f84
DA
2592 NL_SET_ERR_MSG(extack,
2593 "Nexthop has invalid gateway or device mismatch");
fc1e64e1
DA
2594 err = -EINVAL;
2595 }
2596
2597 ip6_rt_put(grt);
2598 }
2599
2600 return err;
2601}
2602
1edce99f
DA
2603static int ip6_route_check_nh(struct net *net,
2604 struct fib6_config *cfg,
2605 struct net_device **_dev,
2606 struct inet6_dev **idev)
2607{
2608 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2609 struct net_device *dev = _dev ? *_dev : NULL;
2610 struct rt6_info *grt = NULL;
2611 int err = -EHOSTUNREACH;
2612
2613 if (cfg->fc_table) {
f4797b33
DA
2614 int flags = RT6_LOOKUP_F_IFACE;
2615
2616 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2617 cfg->fc_table, flags);
1edce99f
DA
2618 if (grt) {
2619 if (grt->rt6i_flags & RTF_GATEWAY ||
2620 (dev && dev != grt->dst.dev)) {
2621 ip6_rt_put(grt);
2622 grt = NULL;
2623 }
2624 }
2625 }
2626
2627 if (!grt)
b75cc8f9 2628 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
1edce99f
DA
2629
2630 if (!grt)
2631 goto out;
2632
2633 if (dev) {
2634 if (dev != grt->dst.dev) {
2635 ip6_rt_put(grt);
2636 goto out;
2637 }
2638 } else {
2639 *_dev = dev = grt->dst.dev;
2640 *idev = grt->rt6i_idev;
2641 dev_hold(dev);
2642 in6_dev_hold(grt->rt6i_idev);
2643 }
2644
2645 if (!(grt->rt6i_flags & RTF_GATEWAY))
2646 err = 0;
2647
2648 ip6_rt_put(grt);
2649
2650out:
2651 return err;
2652}
2653
9fbb704c
DA
2654static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2655 struct net_device **_dev, struct inet6_dev **idev,
2656 struct netlink_ext_ack *extack)
2657{
2658 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2659 int gwa_type = ipv6_addr_type(gw_addr);
232378e8 2660 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
9fbb704c 2661 const struct net_device *dev = *_dev;
232378e8 2662 bool need_addr_check = !dev;
9fbb704c
DA
2663 int err = -EINVAL;
2664
2665 /* if gw_addr is local we will fail to detect this in case
2666 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2667 * will return already-added prefix route via interface that
2668 * prefix route was assigned to, which might be non-loopback.
2669 */
232378e8
DA
2670 if (dev &&
2671 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2672 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
9fbb704c
DA
2673 goto out;
2674 }
2675
2676 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2677 /* IPv6 strictly inhibits using not link-local
2678 * addresses as nexthop address.
2679 * Otherwise, router will not able to send redirects.
2680 * It is very good, but in some (rare!) circumstances
2681 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2682 * some exceptions. --ANK
2683 * We allow IPv4-mapped nexthops to support RFC4798-type
2684 * addressing
2685 */
2686 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2687 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2688 goto out;
2689 }
2690
2691 if (cfg->fc_flags & RTNH_F_ONLINK)
2692 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2693 else
2694 err = ip6_route_check_nh(net, cfg, _dev, idev);
2695
2696 if (err)
2697 goto out;
2698 }
2699
2700 /* reload in case device was changed */
2701 dev = *_dev;
2702
2703 err = -EINVAL;
2704 if (!dev) {
2705 NL_SET_ERR_MSG(extack, "Egress device not specified");
2706 goto out;
2707 } else if (dev->flags & IFF_LOOPBACK) {
2708 NL_SET_ERR_MSG(extack,
2709 "Egress device can not be loopback device for this route");
2710 goto out;
2711 }
232378e8
DA
2712
2713 /* if we did not check gw_addr above, do so now that the
2714 * egress device has been resolved.
2715 */
2716 if (need_addr_check &&
2717 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2718 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2719 goto out;
2720 }
2721
9fbb704c
DA
2722 err = 0;
2723out:
2724 return err;
2725}
2726
333c4301
DA
2727static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2728 struct netlink_ext_ack *extack)
1da177e4 2729{
5578689a 2730 struct net *net = cfg->fc_nlinfo.nl_net;
1da177e4
LT
2731 struct rt6_info *rt = NULL;
2732 struct net_device *dev = NULL;
2733 struct inet6_dev *idev = NULL;
c71099ac 2734 struct fib6_table *table;
1da177e4 2735 int addr_type;
8c5b83f0 2736 int err = -EINVAL;
1da177e4 2737
557c44be 2738 /* RTF_PCPU is an internal flag; can not be set by userspace */
d5d531cb
DA
2739 if (cfg->fc_flags & RTF_PCPU) {
2740 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
557c44be 2741 goto out;
d5d531cb 2742 }
557c44be 2743
2ea2352e
WW
2744 /* RTF_CACHE is an internal flag; can not be set by userspace */
2745 if (cfg->fc_flags & RTF_CACHE) {
2746 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2747 goto out;
2748 }
2749
d5d531cb
DA
2750 if (cfg->fc_dst_len > 128) {
2751 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2752 goto out;
2753 }
2754 if (cfg->fc_src_len > 128) {
2755 NL_SET_ERR_MSG(extack, "Invalid source address length");
8c5b83f0 2756 goto out;
d5d531cb 2757 }
1da177e4 2758#ifndef CONFIG_IPV6_SUBTREES
d5d531cb
DA
2759 if (cfg->fc_src_len) {
2760 NL_SET_ERR_MSG(extack,
2761 "Specifying source address requires IPV6_SUBTREES to be enabled");
8c5b83f0 2762 goto out;
d5d531cb 2763 }
1da177e4 2764#endif
86872cb5 2765 if (cfg->fc_ifindex) {
1da177e4 2766 err = -ENODEV;
5578689a 2767 dev = dev_get_by_index(net, cfg->fc_ifindex);
1da177e4
LT
2768 if (!dev)
2769 goto out;
2770 idev = in6_dev_get(dev);
2771 if (!idev)
2772 goto out;
2773 }
2774
86872cb5
TG
2775 if (cfg->fc_metric == 0)
2776 cfg->fc_metric = IP6_RT_PRIO_USER;
1da177e4 2777
fc1e64e1
DA
2778 if (cfg->fc_flags & RTNH_F_ONLINK) {
2779 if (!dev) {
2780 NL_SET_ERR_MSG(extack,
2781 "Nexthop device required for onlink");
2782 err = -ENODEV;
2783 goto out;
2784 }
2785
2786 if (!(dev->flags & IFF_UP)) {
2787 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2788 err = -ENETDOWN;
2789 goto out;
2790 }
2791 }
2792
d71314b4 2793 err = -ENOBUFS;
38308473
DM
2794 if (cfg->fc_nlinfo.nlh &&
2795 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
d71314b4 2796 table = fib6_get_table(net, cfg->fc_table);
38308473 2797 if (!table) {
f3213831 2798 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
d71314b4
MV
2799 table = fib6_new_table(net, cfg->fc_table);
2800 }
2801 } else {
2802 table = fib6_new_table(net, cfg->fc_table);
2803 }
38308473
DM
2804
2805 if (!table)
c71099ac 2806 goto out;
c71099ac 2807
ad706862
MKL
2808 rt = ip6_dst_alloc(net, NULL,
2809 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1da177e4 2810
38308473 2811 if (!rt) {
1da177e4
LT
2812 err = -ENOMEM;
2813 goto out;
2814 }
2815
1716a961
G
2816 if (cfg->fc_flags & RTF_EXPIRES)
2817 rt6_set_expires(rt, jiffies +
2818 clock_t_to_jiffies(cfg->fc_expires));
2819 else
2820 rt6_clean_expires(rt);
1da177e4 2821
86872cb5
TG
2822 if (cfg->fc_protocol == RTPROT_UNSPEC)
2823 cfg->fc_protocol = RTPROT_BOOT;
2824 rt->rt6i_protocol = cfg->fc_protocol;
2825
2826 addr_type = ipv6_addr_type(&cfg->fc_dst);
1da177e4
LT
2827
2828 if (addr_type & IPV6_ADDR_MULTICAST)
d8d1f30b 2829 rt->dst.input = ip6_mc_input;
ab79ad14
2830 else if (cfg->fc_flags & RTF_LOCAL)
2831 rt->dst.input = ip6_input;
1da177e4 2832 else
d8d1f30b 2833 rt->dst.input = ip6_forward;
1da177e4 2834
d8d1f30b 2835 rt->dst.output = ip6_output;
1da177e4 2836
19e42e45
RP
2837 if (cfg->fc_encap) {
2838 struct lwtunnel_state *lwtstate;
2839
30357d7d 2840 err = lwtunnel_build_state(cfg->fc_encap_type,
127eb7cd 2841 cfg->fc_encap, AF_INET6, cfg,
9ae28727 2842 &lwtstate, extack);
19e42e45
RP
2843 if (err)
2844 goto out;
61adedf3 2845 rt->dst.lwtstate = lwtstate_get(lwtstate);
9942895b 2846 lwtunnel_set_redirect(&rt->dst);
19e42e45
RP
2847 }
2848
86872cb5
TG
2849 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2850 rt->rt6i_dst.plen = cfg->fc_dst_len;
afc4eef8 2851 if (rt->rt6i_dst.plen == 128)
e5fd387a 2852 rt->dst.flags |= DST_HOST;
e5fd387a 2853
1da177e4 2854#ifdef CONFIG_IPV6_SUBTREES
86872cb5
TG
2855 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2856 rt->rt6i_src.plen = cfg->fc_src_len;
1da177e4
LT
2857#endif
2858
86872cb5 2859 rt->rt6i_metric = cfg->fc_metric;
398958ae 2860 rt->rt6i_nh_weight = 1;
1da177e4
LT
2861
2862 /* We cannot add true routes via loopback here,
2863 they would result in kernel looping; promote them to reject routes
2864 */
86872cb5 2865 if ((cfg->fc_flags & RTF_REJECT) ||
38308473
DM
2866 (dev && (dev->flags & IFF_LOOPBACK) &&
2867 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2868 !(cfg->fc_flags & RTF_LOCAL))) {
1da177e4 2869 /* hold loopback dev/idev if we haven't done so. */
5578689a 2870 if (dev != net->loopback_dev) {
1da177e4
LT
2871 if (dev) {
2872 dev_put(dev);
2873 in6_dev_put(idev);
2874 }
5578689a 2875 dev = net->loopback_dev;
1da177e4
LT
2876 dev_hold(dev);
2877 idev = in6_dev_get(dev);
2878 if (!idev) {
2879 err = -ENODEV;
2880 goto out;
2881 }
2882 }
1da177e4 2883 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
ef2c7d7b
ND
2884 switch (cfg->fc_type) {
2885 case RTN_BLACKHOLE:
2886 rt->dst.error = -EINVAL;
ede2059d 2887 rt->dst.output = dst_discard_out;
7150aede 2888 rt->dst.input = dst_discard;
ef2c7d7b
ND
2889 break;
2890 case RTN_PROHIBIT:
2891 rt->dst.error = -EACCES;
7150aede
K
2892 rt->dst.output = ip6_pkt_prohibit_out;
2893 rt->dst.input = ip6_pkt_prohibit;
ef2c7d7b 2894 break;
b4949ab2 2895 case RTN_THROW:
0315e382 2896 case RTN_UNREACHABLE:
ef2c7d7b 2897 default:
7150aede 2898 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
0315e382
NF
2899 : (cfg->fc_type == RTN_UNREACHABLE)
2900 ? -EHOSTUNREACH : -ENETUNREACH;
7150aede
K
2901 rt->dst.output = ip6_pkt_discard_out;
2902 rt->dst.input = ip6_pkt_discard;
ef2c7d7b
ND
2903 break;
2904 }
1da177e4
LT
2905 goto install_route;
2906 }
2907
86872cb5 2908 if (cfg->fc_flags & RTF_GATEWAY) {
9fbb704c
DA
2909 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2910 if (err)
48ed7b26 2911 goto out;
1da177e4 2912
9fbb704c 2913 rt->rt6i_gateway = cfg->fc_gateway;
1da177e4
LT
2914 }
2915
2916 err = -ENODEV;
38308473 2917 if (!dev)
1da177e4
LT
2918 goto out;
2919
955ec4cb
DA
2920 if (!(dev->flags & IFF_UP)) {
2921 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2922 err = -ENETDOWN;
2923 goto out;
2924 }
2925
c3968a85
DW
2926 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2927 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
d5d531cb 2928 NL_SET_ERR_MSG(extack, "Invalid source address");
c3968a85
DW
2929 err = -EINVAL;
2930 goto out;
2931 }
4e3fd7a0 2932 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
c3968a85
DW
2933 rt->rt6i_prefsrc.plen = 128;
2934 } else
2935 rt->rt6i_prefsrc.plen = 0;
2936
86872cb5 2937 rt->rt6i_flags = cfg->fc_flags;
1da177e4
LT
2938
2939install_route:
5609b80a
IS
2940 if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2941 !netif_carrier_ok(dev))
2942 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
fc1e64e1 2943 rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
d8d1f30b 2944 rt->dst.dev = dev;
1da177e4 2945 rt->rt6i_idev = idev;
c71099ac 2946 rt->rt6i_table = table;
63152fc0 2947
c346dca1 2948 cfg->fc_nlinfo.nl_net = dev_net(dev);
63152fc0 2949
8c5b83f0 2950 return rt;
6b9ea5a6
RP
2951out:
2952 if (dev)
2953 dev_put(dev);
2954 if (idev)
2955 in6_dev_put(idev);
587fea74
WW
2956 if (rt)
2957 dst_release_immediate(&rt->dst);
6b9ea5a6 2958
8c5b83f0 2959 return ERR_PTR(err);
6b9ea5a6
RP
2960}
2961
333c4301
DA
2962int ip6_route_add(struct fib6_config *cfg,
2963 struct netlink_ext_ack *extack)
6b9ea5a6
RP
2964{
2965 struct mx6_config mxc = { .mx = NULL, };
8c5b83f0 2966 struct rt6_info *rt;
6b9ea5a6
RP
2967 int err;
2968
333c4301 2969 rt = ip6_route_info_create(cfg, extack);
8c5b83f0
RP
2970 if (IS_ERR(rt)) {
2971 err = PTR_ERR(rt);
2972 rt = NULL;
6b9ea5a6 2973 goto out;
8c5b83f0 2974 }
6b9ea5a6 2975
e715b6d3
FW
2976 err = ip6_convert_metrics(&mxc, cfg);
2977 if (err)
2978 goto out;
1da177e4 2979
333c4301 2980 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
e715b6d3
FW
2981
2982 kfree(mxc.mx);
6b9ea5a6 2983
e715b6d3 2984 return err;
1da177e4 2985out:
587fea74
WW
2986 if (rt)
2987 dst_release_immediate(&rt->dst);
6b9ea5a6 2988
1da177e4
LT
2989 return err;
2990}
2991
86872cb5 2992static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1da177e4
LT
2993{
2994 int err;
c71099ac 2995 struct fib6_table *table;
d1918542 2996 struct net *net = dev_net(rt->dst.dev);
1da177e4 2997
a4c2fd7f 2998 if (rt == net->ipv6.ip6_null_entry) {
6825a26c
G
2999 err = -ENOENT;
3000 goto out;
3001 }
6c813a72 3002
c71099ac 3003 table = rt->rt6i_table;
66f5d6ce 3004 spin_lock_bh(&table->tb6_lock);
86872cb5 3005 err = fib6_del(rt, info);
66f5d6ce 3006 spin_unlock_bh(&table->tb6_lock);
1da177e4 3007
6825a26c 3008out:
94e187c0 3009 ip6_rt_put(rt);
1da177e4
LT
3010 return err;
3011}
3012
e0a1ad73
TG
3013int ip6_del_rt(struct rt6_info *rt)
3014{
4d1169c1 3015 struct nl_info info = {
d1918542 3016 .nl_net = dev_net(rt->dst.dev),
4d1169c1 3017 };
528c4ceb 3018 return __ip6_del_rt(rt, &info);
e0a1ad73
TG
3019}
3020
0ae81335
DA
3021static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
3022{
3023 struct nl_info *info = &cfg->fc_nlinfo;
e3330039 3024 struct net *net = info->nl_net;
16a16cd3 3025 struct sk_buff *skb = NULL;
0ae81335 3026 struct fib6_table *table;
e3330039 3027 int err = -ENOENT;
0ae81335 3028
e3330039
WC
3029 if (rt == net->ipv6.ip6_null_entry)
3030 goto out_put;
0ae81335 3031 table = rt->rt6i_table;
66f5d6ce 3032 spin_lock_bh(&table->tb6_lock);
0ae81335
DA
3033
3034 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
3035 struct rt6_info *sibling, *next_sibling;
3036
16a16cd3
DA
3037 /* prefer to send a single notification with all hops */
3038 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3039 if (skb) {
3040 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3041
e3330039 3042 if (rt6_fill_node(net, skb, rt,
16a16cd3
DA
3043 NULL, NULL, 0, RTM_DELROUTE,
3044 info->portid, seq, 0) < 0) {
3045 kfree_skb(skb);
3046 skb = NULL;
3047 } else
3048 info->skip_notify = 1;
3049 }
3050
0ae81335
DA
3051 list_for_each_entry_safe(sibling, next_sibling,
3052 &rt->rt6i_siblings,
3053 rt6i_siblings) {
3054 err = fib6_del(sibling, info);
3055 if (err)
e3330039 3056 goto out_unlock;
0ae81335
DA
3057 }
3058 }
3059
3060 err = fib6_del(rt, info);
e3330039 3061out_unlock:
66f5d6ce 3062 spin_unlock_bh(&table->tb6_lock);
e3330039 3063out_put:
0ae81335 3064 ip6_rt_put(rt);
16a16cd3
DA
3065
3066 if (skb) {
e3330039 3067 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
16a16cd3
DA
3068 info->nlh, gfp_any());
3069 }
0ae81335
DA
3070 return err;
3071}
3072
333c4301
DA
3073static int ip6_route_del(struct fib6_config *cfg,
3074 struct netlink_ext_ack *extack)
1da177e4 3075{
2b760fcf 3076 struct rt6_info *rt, *rt_cache;
c71099ac 3077 struct fib6_table *table;
1da177e4 3078 struct fib6_node *fn;
1da177e4
LT
3079 int err = -ESRCH;
3080
5578689a 3081 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
d5d531cb
DA
3082 if (!table) {
3083 NL_SET_ERR_MSG(extack, "FIB table does not exist");
c71099ac 3084 return err;
d5d531cb 3085 }
c71099ac 3086
66f5d6ce 3087 rcu_read_lock();
1da177e4 3088
c71099ac 3089 fn = fib6_locate(&table->tb6_root,
86872cb5 3090 &cfg->fc_dst, cfg->fc_dst_len,
38fbeeee 3091 &cfg->fc_src, cfg->fc_src_len,
2b760fcf 3092 !(cfg->fc_flags & RTF_CACHE));
1ab1457c 3093
1da177e4 3094 if (fn) {
66f5d6ce 3095 for_each_fib6_node_rt_rcu(fn) {
2b760fcf
WW
3096 if (cfg->fc_flags & RTF_CACHE) {
3097 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3098 &cfg->fc_src);
3099 if (!rt_cache)
3100 continue;
3101 rt = rt_cache;
3102 }
86872cb5 3103 if (cfg->fc_ifindex &&
d1918542
DM
3104 (!rt->dst.dev ||
3105 rt->dst.dev->ifindex != cfg->fc_ifindex))
1da177e4 3106 continue;
86872cb5
TG
3107 if (cfg->fc_flags & RTF_GATEWAY &&
3108 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1da177e4 3109 continue;
86872cb5 3110 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1da177e4 3111 continue;
c2ed1880
M
3112 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3113 continue;
d3843fe5
WW
3114 if (!dst_hold_safe(&rt->dst))
3115 break;
66f5d6ce 3116 rcu_read_unlock();
1da177e4 3117
0ae81335
DA
3118 /* if gateway was specified only delete the one hop */
3119 if (cfg->fc_flags & RTF_GATEWAY)
3120 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3121
3122 return __ip6_del_rt_siblings(rt, cfg);
1da177e4
LT
3123 }
3124 }
66f5d6ce 3125 rcu_read_unlock();
1da177e4
LT
3126
3127 return err;
3128}
3129
6700c270 3130static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
a6279458 3131{
a6279458 3132 struct netevent_redirect netevent;
e8599ff4 3133 struct rt6_info *rt, *nrt = NULL;
e8599ff4
DM
3134 struct ndisc_options ndopts;
3135 struct inet6_dev *in6_dev;
3136 struct neighbour *neigh;
71bcdba0 3137 struct rd_msg *msg;
6e157b6a
DM
3138 int optlen, on_link;
3139 u8 *lladdr;
e8599ff4 3140
29a3cad5 3141 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
71bcdba0 3142 optlen -= sizeof(*msg);
e8599ff4
DM
3143
3144 if (optlen < 0) {
6e157b6a 3145 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
e8599ff4
DM
3146 return;
3147 }
3148
71bcdba0 3149 msg = (struct rd_msg *)icmp6_hdr(skb);
e8599ff4 3150
71bcdba0 3151 if (ipv6_addr_is_multicast(&msg->dest)) {
6e157b6a 3152 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
e8599ff4
DM
3153 return;
3154 }
3155
6e157b6a 3156 on_link = 0;
71bcdba0 3157 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
e8599ff4 3158 on_link = 1;
71bcdba0 3159 } else if (ipv6_addr_type(&msg->target) !=
e8599ff4 3160 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
6e157b6a 3161 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
e8599ff4
DM
3162 return;
3163 }
3164
3165 in6_dev = __in6_dev_get(skb->dev);
3166 if (!in6_dev)
3167 return;
3168 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3169 return;
3170
3171 /* RFC2461 8.1:
3172 * The IP source address of the Redirect MUST be the same as the current
3173 * first-hop router for the specified ICMP Destination Address.
3174 */
3175
f997c55c 3176 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
e8599ff4
DM
3177 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3178 return;
3179 }
6e157b6a
DM
3180
3181 lladdr = NULL;
e8599ff4
DM
3182 if (ndopts.nd_opts_tgt_lladdr) {
3183 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3184 skb->dev);
3185 if (!lladdr) {
3186 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3187 return;
3188 }
3189 }
3190
6e157b6a 3191 rt = (struct rt6_info *) dst;
ec13ad1d 3192 if (rt->rt6i_flags & RTF_REJECT) {
6e157b6a 3193 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
e8599ff4 3194 return;
6e157b6a 3195 }
e8599ff4 3196
6e157b6a
DM
3197 /* Redirect received -> path was valid.
3198 * Look, redirects are sent only in response to data packets,
3199 * so that this nexthop apparently is reachable. --ANK
3200 */
0dec879f 3201 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
a6279458 3202
71bcdba0 3203 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
6e157b6a
DM
3204 if (!neigh)
3205 return;
a6279458 3206
1da177e4
LT
3207 /*
3208 * We have finally decided to accept it.
3209 */
3210
f997c55c 3211 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
1da177e4
LT
3212 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3213 NEIGH_UPDATE_F_OVERRIDE|
3214 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
f997c55c
AA
3215 NEIGH_UPDATE_F_ISROUTER)),
3216 NDISC_REDIRECT, &ndopts);
1da177e4 3217
83a09abd 3218 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
38308473 3219 if (!nrt)
1da177e4
LT
3220 goto out;
3221
3222 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3223 if (on_link)
3224 nrt->rt6i_flags &= ~RTF_GATEWAY;
3225
b91d5329 3226 nrt->rt6i_protocol = RTPROT_REDIRECT;
4e3fd7a0 3227 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1da177e4 3228
2b760fcf
WW
3229 /* No need to remove rt from the exception table if rt is
3230 * a cached route because rt6_insert_exception() will
3231 * takes care of it
3232 */
3233 if (rt6_insert_exception(nrt, rt)) {
3234 dst_release_immediate(&nrt->dst);
3235 goto out;
3236 }
1da177e4 3237
d8d1f30b
CG
3238 netevent.old = &rt->dst;
3239 netevent.new = &nrt->dst;
71bcdba0 3240 netevent.daddr = &msg->dest;
60592833 3241 netevent.neigh = neigh;
8d71740c
TT
3242 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3243
1da177e4 3244out:
e8599ff4 3245 neigh_release(neigh);
6e157b6a
DM
3246}
3247
1da177e4
LT
3248/*
3249 * Misc support functions
3250 */
3251
4b32b5ad
MKL
3252static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3253{
3a2232e9 3254 BUG_ON(from->from);
4b32b5ad
MKL
3255
3256 rt->rt6i_flags &= ~RTF_EXPIRES;
3257 dst_hold(&from->dst);
3a2232e9 3258 rt->from = from;
4b32b5ad
MKL
3259 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3260}
3261
83a09abd
MKL
3262static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3263{
3264 rt->dst.input = ort->dst.input;
3265 rt->dst.output = ort->dst.output;
3266 rt->rt6i_dst = ort->rt6i_dst;
3267 rt->dst.error = ort->dst.error;
3268 rt->rt6i_idev = ort->rt6i_idev;
3269 if (rt->rt6i_idev)
3270 in6_dev_hold(rt->rt6i_idev);
3271 rt->dst.lastuse = jiffies;
3272 rt->rt6i_gateway = ort->rt6i_gateway;
3273 rt->rt6i_flags = ort->rt6i_flags;
3274 rt6_set_from(rt, ort);
3275 rt->rt6i_metric = ort->rt6i_metric;
1da177e4 3276#ifdef CONFIG_IPV6_SUBTREES
83a09abd 3277 rt->rt6i_src = ort->rt6i_src;
1da177e4 3278#endif
83a09abd
MKL
3279 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3280 rt->rt6i_table = ort->rt6i_table;
61adedf3 3281 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
1da177e4
LT
3282}
3283
70ceb4f5 3284#ifdef CONFIG_IPV6_ROUTE_INFO
efa2cea0 3285static struct rt6_info *rt6_get_route_info(struct net *net,
b71d1d42 3286 const struct in6_addr *prefix, int prefixlen,
830218c1
DA
3287 const struct in6_addr *gwaddr,
3288 struct net_device *dev)
70ceb4f5 3289{
830218c1
DA
3290 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3291 int ifindex = dev->ifindex;
70ceb4f5
YH
3292 struct fib6_node *fn;
3293 struct rt6_info *rt = NULL;
c71099ac
TG
3294 struct fib6_table *table;
3295
830218c1 3296 table = fib6_get_table(net, tb_id);
38308473 3297 if (!table)
c71099ac 3298 return NULL;
70ceb4f5 3299
66f5d6ce 3300 rcu_read_lock();
38fbeeee 3301 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
70ceb4f5
YH
3302 if (!fn)
3303 goto out;
3304
66f5d6ce 3305 for_each_fib6_node_rt_rcu(fn) {
d1918542 3306 if (rt->dst.dev->ifindex != ifindex)
70ceb4f5
YH
3307 continue;
3308 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3309 continue;
3310 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3311 continue;
d3843fe5 3312 ip6_hold_safe(NULL, &rt, false);
70ceb4f5
YH
3313 break;
3314 }
3315out:
66f5d6ce 3316 rcu_read_unlock();
70ceb4f5
YH
3317 return rt;
3318}
3319
efa2cea0 3320static struct rt6_info *rt6_add_route_info(struct net *net,
b71d1d42 3321 const struct in6_addr *prefix, int prefixlen,
830218c1
DA
3322 const struct in6_addr *gwaddr,
3323 struct net_device *dev,
95c96174 3324 unsigned int pref)
70ceb4f5 3325{
86872cb5 3326 struct fib6_config cfg = {
238fc7ea 3327 .fc_metric = IP6_RT_PRIO_USER,
830218c1 3328 .fc_ifindex = dev->ifindex,
86872cb5
TG
3329 .fc_dst_len = prefixlen,
3330 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3331 RTF_UP | RTF_PREF(pref),
b91d5329 3332 .fc_protocol = RTPROT_RA,
15e47304 3333 .fc_nlinfo.portid = 0,
efa2cea0
DL
3334 .fc_nlinfo.nlh = NULL,
3335 .fc_nlinfo.nl_net = net,
86872cb5
TG
3336 };
3337
830218c1 3338 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
4e3fd7a0
AD
3339 cfg.fc_dst = *prefix;
3340 cfg.fc_gateway = *gwaddr;
70ceb4f5 3341
e317da96
YH
3342 /* We should treat it as a default route if prefix length is 0. */
3343 if (!prefixlen)
86872cb5 3344 cfg.fc_flags |= RTF_DEFAULT;
70ceb4f5 3345
333c4301 3346 ip6_route_add(&cfg, NULL);
70ceb4f5 3347
830218c1 3348 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
70ceb4f5
YH
3349}
3350#endif
3351
b71d1d42 3352struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1ab1457c 3353{
830218c1 3354 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
1da177e4 3355 struct rt6_info *rt;
c71099ac 3356 struct fib6_table *table;
1da177e4 3357
830218c1 3358 table = fib6_get_table(dev_net(dev), tb_id);
38308473 3359 if (!table)
c71099ac 3360 return NULL;
1da177e4 3361
66f5d6ce
WW
3362 rcu_read_lock();
3363 for_each_fib6_node_rt_rcu(&table->tb6_root) {
d1918542 3364 if (dev == rt->dst.dev &&
045927ff 3365 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1da177e4
LT
3366 ipv6_addr_equal(&rt->rt6i_gateway, addr))
3367 break;
3368 }
3369 if (rt)
d3843fe5 3370 ip6_hold_safe(NULL, &rt, false);
66f5d6ce 3371 rcu_read_unlock();
1da177e4
LT
3372 return rt;
3373}
3374
b71d1d42 3375struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
ebacaaa0
YH
3376 struct net_device *dev,
3377 unsigned int pref)
1da177e4 3378{
86872cb5 3379 struct fib6_config cfg = {
ca254490 3380 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
238fc7ea 3381 .fc_metric = IP6_RT_PRIO_USER,
86872cb5
TG
3382 .fc_ifindex = dev->ifindex,
3383 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3384 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
b91d5329 3385 .fc_protocol = RTPROT_RA,
15e47304 3386 .fc_nlinfo.portid = 0,
5578689a 3387 .fc_nlinfo.nlh = NULL,
c346dca1 3388 .fc_nlinfo.nl_net = dev_net(dev),
86872cb5 3389 };
1da177e4 3390
4e3fd7a0 3391 cfg.fc_gateway = *gwaddr;
1da177e4 3392
333c4301 3393 if (!ip6_route_add(&cfg, NULL)) {
830218c1
DA
3394 struct fib6_table *table;
3395
3396 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3397 if (table)
3398 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3399 }
1da177e4 3400
1da177e4
LT
3401 return rt6_get_dflt_router(gwaddr, dev);
3402}
3403
830218c1 3404static void __rt6_purge_dflt_routers(struct fib6_table *table)
1da177e4
LT
3405{
3406 struct rt6_info *rt;
3407
3408restart:
66f5d6ce
WW
3409 rcu_read_lock();
3410 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3e8b0ac3
LC
3411 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3412 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
d3843fe5 3413 if (dst_hold_safe(&rt->dst)) {
66f5d6ce 3414 rcu_read_unlock();
d3843fe5
WW
3415 ip6_del_rt(rt);
3416 } else {
66f5d6ce 3417 rcu_read_unlock();
d3843fe5 3418 }
1da177e4
LT
3419 goto restart;
3420 }
3421 }
66f5d6ce 3422 rcu_read_unlock();
830218c1
DA
3423
3424 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3425}
3426
3427void rt6_purge_dflt_routers(struct net *net)
3428{
3429 struct fib6_table *table;
3430 struct hlist_head *head;
3431 unsigned int h;
3432
3433 rcu_read_lock();
3434
3435 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3436 head = &net->ipv6.fib_table_hash[h];
3437 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3438 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3439 __rt6_purge_dflt_routers(table);
3440 }
3441 }
3442
3443 rcu_read_unlock();
1da177e4
LT
3444}
3445
5578689a
DL
3446static void rtmsg_to_fib6_config(struct net *net,
3447 struct in6_rtmsg *rtmsg,
86872cb5
TG
3448 struct fib6_config *cfg)
3449{
3450 memset(cfg, 0, sizeof(*cfg));
3451
ca254490
DA
3452 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3453 : RT6_TABLE_MAIN;
86872cb5
TG
3454 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3455 cfg->fc_metric = rtmsg->rtmsg_metric;
3456 cfg->fc_expires = rtmsg->rtmsg_info;
3457 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3458 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3459 cfg->fc_flags = rtmsg->rtmsg_flags;
3460
5578689a 3461 cfg->fc_nlinfo.nl_net = net;
f1243c2d 3462
4e3fd7a0
AD
3463 cfg->fc_dst = rtmsg->rtmsg_dst;
3464 cfg->fc_src = rtmsg->rtmsg_src;
3465 cfg->fc_gateway = rtmsg->rtmsg_gateway;
86872cb5
TG
3466}
3467
5578689a 3468int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1da177e4 3469{
86872cb5 3470 struct fib6_config cfg;
1da177e4
LT
3471 struct in6_rtmsg rtmsg;
3472 int err;
3473
67ba4152 3474 switch (cmd) {
1da177e4
LT
3475 case SIOCADDRT: /* Add a route */
3476 case SIOCDELRT: /* Delete a route */
af31f412 3477 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1da177e4
LT
3478 return -EPERM;
3479 err = copy_from_user(&rtmsg, arg,
3480 sizeof(struct in6_rtmsg));
3481 if (err)
3482 return -EFAULT;
86872cb5 3483
5578689a 3484 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
86872cb5 3485
1da177e4
LT
3486 rtnl_lock();
3487 switch (cmd) {
3488 case SIOCADDRT:
333c4301 3489 err = ip6_route_add(&cfg, NULL);
1da177e4
LT
3490 break;
3491 case SIOCDELRT:
333c4301 3492 err = ip6_route_del(&cfg, NULL);
1da177e4
LT
3493 break;
3494 default:
3495 err = -EINVAL;
3496 }
3497 rtnl_unlock();
3498
3499 return err;
3ff50b79 3500 }
1da177e4
LT
3501
3502 return -EINVAL;
3503}
3504
3505/*
3506 * Drop the packet on the floor
3507 */
3508
d5fdd6ba 3509static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1da177e4 3510{
612f09e8 3511 int type;
adf30907 3512 struct dst_entry *dst = skb_dst(skb);
612f09e8
YH
3513 switch (ipstats_mib_noroutes) {
3514 case IPSTATS_MIB_INNOROUTES:
0660e03f 3515 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
45bb0060 3516 if (type == IPV6_ADDR_ANY) {
3bd653c8
DL
3517 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3518 IPSTATS_MIB_INADDRERRORS);
612f09e8
YH
3519 break;
3520 }
3521 /* FALLTHROUGH */
3522 case IPSTATS_MIB_OUTNOROUTES:
3bd653c8
DL
3523 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3524 ipstats_mib_noroutes);
612f09e8
YH
3525 break;
3526 }
3ffe533c 3527 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1da177e4
LT
3528 kfree_skb(skb);
3529 return 0;
3530}
3531
9ce8ade0
TG
3532static int ip6_pkt_discard(struct sk_buff *skb)
3533{
612f09e8 3534 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
9ce8ade0
TG
3535}
3536
ede2059d 3537static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
1da177e4 3538{
adf30907 3539 skb->dev = skb_dst(skb)->dev;
612f09e8 3540 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1da177e4
LT
3541}
3542
9ce8ade0
TG
3543static int ip6_pkt_prohibit(struct sk_buff *skb)
3544{
612f09e8 3545 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
9ce8ade0
TG
3546}
3547
ede2059d 3548static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
9ce8ade0 3549{
adf30907 3550 skb->dev = skb_dst(skb)->dev;
612f09e8 3551 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
9ce8ade0
TG
3552}
3553
1da177e4
LT
3554/*
3555 * Allocate a dst for local (unicast / anycast) address.
3556 */
3557
3558struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3559 const struct in6_addr *addr,
8f031519 3560 bool anycast)
1da177e4 3561{
ca254490 3562 u32 tb_id;
c346dca1 3563 struct net *net = dev_net(idev->dev);
4832c30d 3564 struct net_device *dev = idev->dev;
5f02ce24
DA
3565 struct rt6_info *rt;
3566
5f02ce24 3567 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
a3300ef4 3568 if (!rt)
1da177e4
LT
3569 return ERR_PTR(-ENOMEM);
3570
1da177e4
LT
3571 in6_dev_hold(idev);
3572
11d53b49 3573 rt->dst.flags |= DST_HOST;
d8d1f30b
CG
3574 rt->dst.input = ip6_input;
3575 rt->dst.output = ip6_output;
1da177e4 3576 rt->rt6i_idev = idev;
1da177e4 3577
94b5e0f9 3578 rt->rt6i_protocol = RTPROT_KERNEL;
1da177e4 3579 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
58c4fb86
YH
3580 if (anycast)
3581 rt->rt6i_flags |= RTF_ANYCAST;
3582 else
1da177e4 3583 rt->rt6i_flags |= RTF_LOCAL;
1da177e4 3584
550bab42 3585 rt->rt6i_gateway = *addr;
4e3fd7a0 3586 rt->rt6i_dst.addr = *addr;
1da177e4 3587 rt->rt6i_dst.plen = 128;
ca254490
DA
3588 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3589 rt->rt6i_table = fib6_get_table(net, tb_id);
1da177e4 3590
1da177e4
LT
3591 return rt;
3592}
3593
c3968a85
DW
/* remove deleted ip from prefsrc entries */
/* Walker argument for fib6_remove_prefsrc(). */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device filter; NULL matches any device */
	struct net *net;	/* namespace, used to recognise the null entry */
	struct in6_addr *addr;	/* address that is going away */
};
3600
3601static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3602{
3603 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3604 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3605 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3606
d1918542 3607 if (((void *)rt->dst.dev == dev || !dev) &&
c3968a85
DW
3608 rt != net->ipv6.ip6_null_entry &&
3609 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
60006a48 3610 spin_lock_bh(&rt6_exception_lock);
c3968a85
DW
3611 /* remove prefsrc entry */
3612 rt->rt6i_prefsrc.plen = 0;
60006a48
WW
3613 /* need to update cache as well */
3614 rt6_exceptions_remove_prefsrc(rt);
3615 spin_unlock_bh(&rt6_exception_lock);
c3968a85
DW
3616 }
3617 return 0;
3618}
3619
3620void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3621{
3622 struct net *net = dev_net(ifp->idev->dev);
3623 struct arg_dev_net_ip adni = {
3624 .dev = ifp->idev->dev,
3625 .net = net,
3626 .addr = &ifp->addr,
3627 };
0c3584d5 3628 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
c3968a85
DW
3629}
3630
be7a010d 3631#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
be7a010d
DJ
3632
3633/* Remove routers and update dst entries when gateway turn into host. */
3634static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3635{
3636 struct in6_addr *gateway = (struct in6_addr *)arg;
3637
2b760fcf
WW
3638 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3639 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
be7a010d
DJ
3640 return -1;
3641 }
b16cb459
WW
3642
3643 /* Further clean up cached routes in exception table.
3644 * This is needed because cached route may have a different
3645 * gateway than its 'parent' in the case of an ip redirect.
3646 */
3647 rt6_exceptions_clean_tohost(rt, gateway);
3648
be7a010d
DJ
3649 return 0;
3650}
3651
3652void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3653{
3654 fib6_clean_all(net, fib6_clean_tohost, gateway);
3655}
3656
2127d95a
IS
/*
 * Walker argument shared by fib6_ifup() and fib6_ifdown(). The union
 * member in use depends on the callback: fib6_ifup() reads nh_flags,
 * fib6_ifdown() reads event.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event is about */
	union {
		unsigned int nh_flags;	/* nexthop flags to clear on "up" */
		unsigned long event;	/* NETDEV_* event on "down" */
	};
};
3664
d7dedee1
IS
3665static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3666{
3667 struct rt6_info *iter;
3668 struct fib6_node *fn;
3669
3670 fn = rcu_dereference_protected(rt->rt6i_node,
3671 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3672 iter = rcu_dereference_protected(fn->leaf,
3673 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3674 while (iter) {
3675 if (iter->rt6i_metric == rt->rt6i_metric &&
3676 rt6_qualify_for_ecmp(iter))
3677 return iter;
3678 iter = rcu_dereference_protected(iter->rt6_next,
3679 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3680 }
3681
3682 return NULL;
3683}
3684
3685static bool rt6_is_dead(const struct rt6_info *rt)
3686{
3687 if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3688 (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3689 rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3690 return true;
3691
3692 return false;
3693}
3694
3695static int rt6_multipath_total_weight(const struct rt6_info *rt)
3696{
3697 struct rt6_info *iter;
3698 int total = 0;
3699
3700 if (!rt6_is_dead(rt))
398958ae 3701 total += rt->rt6i_nh_weight;
d7dedee1
IS
3702
3703 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3704 if (!rt6_is_dead(iter))
398958ae 3705 total += iter->rt6i_nh_weight;
d7dedee1
IS
3706 }
3707
3708 return total;
3709}
3710
3711static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3712{
3713 int upper_bound = -1;
3714
3715 if (!rt6_is_dead(rt)) {
398958ae 3716 *weight += rt->rt6i_nh_weight;
d7dedee1
IS
3717 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3718 total) - 1;
3719 }
3720 atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3721}
3722
3723static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3724{
3725 struct rt6_info *iter;
3726 int weight = 0;
3727
3728 rt6_upper_bound_set(rt, &weight, total);
3729
3730 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3731 rt6_upper_bound_set(iter, &weight, total);
3732}
3733
3734void rt6_multipath_rebalance(struct rt6_info *rt)
3735{
3736 struct rt6_info *first;
3737 int total;
3738
3739 /* In case the entire multipath route was marked for flushing,
3740 * then there is no need to rebalance upon the removal of every
3741 * sibling route.
3742 */
3743 if (!rt->rt6i_nsiblings || rt->should_flush)
3744 return;
3745
3746 /* During lookup routes are evaluated in order, so we need to
3747 * make sure upper bounds are assigned from the first sibling
3748 * onwards.
3749 */
3750 first = rt6_multipath_first_sibling(rt);
3751 if (WARN_ON_ONCE(!first))
3752 return;
3753
3754 total = rt6_multipath_total_weight(first);
3755 rt6_multipath_upper_bound_set(first, total);
3756}
3757
2127d95a
IS
3758static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3759{
3760 const struct arg_netdev_event *arg = p_arg;
3761 const struct net *net = dev_net(arg->dev);
3762
1de178ed 3763 if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
2127d95a 3764 rt->rt6i_nh_flags &= ~arg->nh_flags;
1de178ed 3765 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
d7dedee1 3766 rt6_multipath_rebalance(rt);
1de178ed 3767 }
2127d95a
IS
3768
3769 return 0;
3770}
3771
3772void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3773{
3774 struct arg_netdev_event arg = {
3775 .dev = dev,
6802f3ad
IS
3776 {
3777 .nh_flags = nh_flags,
3778 },
2127d95a
IS
3779 };
3780
3781 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3782 arg.nh_flags |= RTNH_F_LINKDOWN;
3783
3784 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3785}
3786
1de178ed
IS
3787static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3788 const struct net_device *dev)
3789{
3790 struct rt6_info *iter;
3791
3792 if (rt->dst.dev == dev)
3793 return true;
3794 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3795 if (iter->dst.dev == dev)
3796 return true;
3797
3798 return false;
3799}
3800
3801static void rt6_multipath_flush(struct rt6_info *rt)
3802{
3803 struct rt6_info *iter;
3804
3805 rt->should_flush = 1;
3806 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3807 iter->should_flush = 1;
3808}
3809
3810static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3811 const struct net_device *down_dev)
3812{
3813 struct rt6_info *iter;
3814 unsigned int dead = 0;
3815
3816 if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3817 dead++;
3818 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3819 if (iter->dst.dev == down_dev ||
3820 iter->rt6i_nh_flags & RTNH_F_DEAD)
3821 dead++;
3822
3823 return dead;
3824}
3825
3826static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3827 const struct net_device *dev,
3828 unsigned int nh_flags)
3829{
3830 struct rt6_info *iter;
3831
3832 if (rt->dst.dev == dev)
3833 rt->rt6i_nh_flags |= nh_flags;
3834 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3835 if (iter->dst.dev == dev)
3836 iter->rt6i_nh_flags |= nh_flags;
3837}
3838
a1a22c12 3839/* called with write lock held for table with rt */
4c981e28 3840static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
1da177e4 3841{
4c981e28
IS
3842 const struct arg_netdev_event *arg = p_arg;
3843 const struct net_device *dev = arg->dev;
3844 const struct net *net = dev_net(dev);
8ed67789 3845
1de178ed 3846 if (rt == net->ipv6.ip6_null_entry)
27c6fa73
IS
3847 return 0;
3848
3849 switch (arg->event) {
3850 case NETDEV_UNREGISTER:
1de178ed 3851 return rt->dst.dev == dev ? -1 : 0;
27c6fa73 3852 case NETDEV_DOWN:
1de178ed 3853 if (rt->should_flush)
27c6fa73 3854 return -1;
1de178ed
IS
3855 if (!rt->rt6i_nsiblings)
3856 return rt->dst.dev == dev ? -1 : 0;
3857 if (rt6_multipath_uses_dev(rt, dev)) {
3858 unsigned int count;
3859
3860 count = rt6_multipath_dead_count(rt, dev);
3861 if (rt->rt6i_nsiblings + 1 == count) {
3862 rt6_multipath_flush(rt);
3863 return -1;
3864 }
3865 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3866 RTNH_F_LINKDOWN);
3867 fib6_update_sernum(rt);
d7dedee1 3868 rt6_multipath_rebalance(rt);
1de178ed
IS
3869 }
3870 return -2;
27c6fa73 3871 case NETDEV_CHANGE:
1de178ed
IS
3872 if (rt->dst.dev != dev ||
3873 rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
27c6fa73
IS
3874 break;
3875 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
d7dedee1 3876 rt6_multipath_rebalance(rt);
27c6fa73 3877 break;
2b241361 3878 }
c159d30c 3879
1da177e4
LT
3880 return 0;
3881}
3882
27c6fa73 3883void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
1da177e4 3884{
4c981e28 3885 struct arg_netdev_event arg = {
8ed67789 3886 .dev = dev,
6802f3ad
IS
3887 {
3888 .event = event,
3889 },
8ed67789
DL
3890 };
3891
4c981e28
IS
3892 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3893}
3894
3895void rt6_disable_ip(struct net_device *dev, unsigned long event)
3896{
3897 rt6_sync_down_dev(dev, event);
3898 rt6_uncached_list_flush_dev(dev_net(dev), dev);
3899 neigh_ifdown(&nd_tbl, dev);
1da177e4
LT
3900}
3901
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
3906
3907static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3908{
3909 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3910 struct inet6_dev *idev;
3911
3912 /* In IPv6 pmtu discovery is not optional,
3913 so that RTAX_MTU lock cannot disable it.
3914 We still use this lock to block changes
3915 caused by addrconf/ndisc.
3916 */
3917
3918 idev = __in6_dev_get(arg->dev);
38308473 3919 if (!idev)
1da177e4
LT
3920 return 0;
3921
3922 /* For administrative MTU increase, there is no way to discover
3923 IPv6 PMTU increase, so PMTU increase should be updated here.
3924 Since RFC 1981 doesn't include administrative MTU increase
3925 update PMTU increase is a MUST. (i.e. jumbo frame)
3926 */
d1918542 3927 if (rt->dst.dev == arg->dev &&
4b32b5ad 3928 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
f5bbe7ee 3929 spin_lock_bh(&rt6_exception_lock);
e9fa1495
SB
3930 if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
3931 rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
4b32b5ad 3932 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
e9fa1495 3933 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
f5bbe7ee 3934 spin_unlock_bh(&rt6_exception_lock);
566cfd8f 3935 }
1da177e4
LT
3936 return 0;
3937}
3938
95c96174 3939void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
1da177e4 3940{
c71099ac
TG
3941 struct rt6_mtu_change_arg arg = {
3942 .dev = dev,
3943 .mtu = mtu,
3944 };
1da177e4 3945
0c3584d5 3946 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
1da177e4
LT
3947}
3948
ef7c79ed 3949static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
5176f91e 3950 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
86872cb5 3951 [RTA_OIF] = { .type = NLA_U32 },
ab364a6f 3952 [RTA_IIF] = { .type = NLA_U32 },
86872cb5
TG
3953 [RTA_PRIORITY] = { .type = NLA_U32 },
3954 [RTA_METRICS] = { .type = NLA_NESTED },
51ebd318 3955 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
c78ba6d6 3956 [RTA_PREF] = { .type = NLA_U8 },
19e42e45
RP
3957 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
3958 [RTA_ENCAP] = { .type = NLA_NESTED },
32bc201e 3959 [RTA_EXPIRES] = { .type = NLA_U32 },
622ec2c9 3960 [RTA_UID] = { .type = NLA_U32 },
3b45a410 3961 [RTA_MARK] = { .type = NLA_U32 },
86872cb5
TG
3962};
3963
3964static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
333c4301
DA
3965 struct fib6_config *cfg,
3966 struct netlink_ext_ack *extack)
1da177e4 3967{
86872cb5
TG
3968 struct rtmsg *rtm;
3969 struct nlattr *tb[RTA_MAX+1];
c78ba6d6 3970 unsigned int pref;
86872cb5 3971 int err;
1da177e4 3972
fceb6435
JB
3973 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3974 NULL);
86872cb5
TG
3975 if (err < 0)
3976 goto errout;
1da177e4 3977
86872cb5
TG
3978 err = -EINVAL;
3979 rtm = nlmsg_data(nlh);
3980 memset(cfg, 0, sizeof(*cfg));
3981
3982 cfg->fc_table = rtm->rtm_table;
3983 cfg->fc_dst_len = rtm->rtm_dst_len;
3984 cfg->fc_src_len = rtm->rtm_src_len;
3985 cfg->fc_flags = RTF_UP;
3986 cfg->fc_protocol = rtm->rtm_protocol;
ef2c7d7b 3987 cfg->fc_type = rtm->rtm_type;
86872cb5 3988
ef2c7d7b
ND
3989 if (rtm->rtm_type == RTN_UNREACHABLE ||
3990 rtm->rtm_type == RTN_BLACKHOLE ||
b4949ab2
ND
3991 rtm->rtm_type == RTN_PROHIBIT ||
3992 rtm->rtm_type == RTN_THROW)
86872cb5
TG
3993 cfg->fc_flags |= RTF_REJECT;
3994
ab79ad14
3995 if (rtm->rtm_type == RTN_LOCAL)
3996 cfg->fc_flags |= RTF_LOCAL;
3997
1f56a01f
MKL
3998 if (rtm->rtm_flags & RTM_F_CLONED)
3999 cfg->fc_flags |= RTF_CACHE;
4000
fc1e64e1
DA
4001 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4002
15e47304 4003 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
86872cb5 4004 cfg->fc_nlinfo.nlh = nlh;
3b1e0a65 4005 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
86872cb5
TG
4006
4007 if (tb[RTA_GATEWAY]) {
67b61f6c 4008 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
86872cb5 4009 cfg->fc_flags |= RTF_GATEWAY;
1da177e4 4010 }
86872cb5
TG
4011
4012 if (tb[RTA_DST]) {
4013 int plen = (rtm->rtm_dst_len + 7) >> 3;
4014
4015 if (nla_len(tb[RTA_DST]) < plen)
4016 goto errout;
4017
4018 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1da177e4 4019 }
86872cb5
TG
4020
4021 if (tb[RTA_SRC]) {
4022 int plen = (rtm->rtm_src_len + 7) >> 3;
4023
4024 if (nla_len(tb[RTA_SRC]) < plen)
4025 goto errout;
4026
4027 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1da177e4 4028 }
86872cb5 4029
c3968a85 4030 if (tb[RTA_PREFSRC])
67b61f6c 4031 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
c3968a85 4032
86872cb5
TG
4033 if (tb[RTA_OIF])
4034 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4035
4036 if (tb[RTA_PRIORITY])
4037 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4038
4039 if (tb[RTA_METRICS]) {
4040 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4041 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1da177e4 4042 }
86872cb5
TG
4043
4044 if (tb[RTA_TABLE])
4045 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4046
51ebd318
ND
4047 if (tb[RTA_MULTIPATH]) {
4048 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4049 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
9ed59592
DA
4050
4051 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
c255bd68 4052 cfg->fc_mp_len, extack);
9ed59592
DA
4053 if (err < 0)
4054 goto errout;
51ebd318
ND
4055 }
4056
c78ba6d6
LR
4057 if (tb[RTA_PREF]) {
4058 pref = nla_get_u8(tb[RTA_PREF]);
4059 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4060 pref != ICMPV6_ROUTER_PREF_HIGH)
4061 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4062 cfg->fc_flags |= RTF_PREF(pref);
4063 }
4064
19e42e45
RP
4065 if (tb[RTA_ENCAP])
4066 cfg->fc_encap = tb[RTA_ENCAP];
4067
9ed59592 4068 if (tb[RTA_ENCAP_TYPE]) {
19e42e45
RP
4069 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4070
c255bd68 4071 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
9ed59592
DA
4072 if (err < 0)
4073 goto errout;
4074 }
4075
32bc201e
XL
4076 if (tb[RTA_EXPIRES]) {
4077 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4078
4079 if (addrconf_finite_timeout(timeout)) {
4080 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4081 cfg->fc_flags |= RTF_EXPIRES;
4082 }
4083 }
4084
86872cb5
TG
4085 err = 0;
4086errout:
4087 return err;
1da177e4
LT
4088}
4089
6b9ea5a6
RP
4090struct rt6_nh {
4091 struct rt6_info *rt6_info;
4092 struct fib6_config r_cfg;
4093 struct mx6_config mxc;
4094 struct list_head next;
4095};
4096
4097static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4098{
4099 struct rt6_nh *nh;
4100
4101 list_for_each_entry(nh, rt6_nh_list, next) {
7d4d5065 4102 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
6b9ea5a6
RP
4103 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4104 nh->r_cfg.fc_ifindex);
4105 }
4106}
4107
4108static int ip6_route_info_append(struct list_head *rt6_nh_list,
4109 struct rt6_info *rt, struct fib6_config *r_cfg)
4110{
4111 struct rt6_nh *nh;
6b9ea5a6
RP
4112 int err = -EEXIST;
4113
4114 list_for_each_entry(nh, rt6_nh_list, next) {
4115 /* check if rt6_info already exists */
f06b7549 4116 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
6b9ea5a6
RP
4117 return err;
4118 }
4119
4120 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4121 if (!nh)
4122 return -ENOMEM;
4123 nh->rt6_info = rt;
4124 err = ip6_convert_metrics(&nh->mxc, r_cfg);
4125 if (err) {
4126 kfree(nh);
4127 return err;
4128 }
4129 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4130 list_add_tail(&nh->next, rt6_nh_list);
4131
4132 return 0;
4133}
4134
3b1137fe
DA
4135static void ip6_route_mpath_notify(struct rt6_info *rt,
4136 struct rt6_info *rt_last,
4137 struct nl_info *info,
4138 __u16 nlflags)
4139{
4140 /* if this is an APPEND route, then rt points to the first route
4141 * inserted and rt_last points to last route inserted. Userspace
4142 * wants a consistent dump of the route which starts at the first
4143 * nexthop. Since sibling routes are always added at the end of
4144 * the list, find the first sibling of the last route appended
4145 */
4146 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4147 rt = list_first_entry(&rt_last->rt6i_siblings,
4148 struct rt6_info,
4149 rt6i_siblings);
4150 }
4151
4152 if (rt)
4153 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4154}
4155
333c4301
DA
4156static int ip6_route_multipath_add(struct fib6_config *cfg,
4157 struct netlink_ext_ack *extack)
51ebd318 4158{
3b1137fe
DA
4159 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4160 struct nl_info *info = &cfg->fc_nlinfo;
51ebd318
ND
4161 struct fib6_config r_cfg;
4162 struct rtnexthop *rtnh;
6b9ea5a6
RP
4163 struct rt6_info *rt;
4164 struct rt6_nh *err_nh;
4165 struct rt6_nh *nh, *nh_safe;
3b1137fe 4166 __u16 nlflags;
51ebd318
ND
4167 int remaining;
4168 int attrlen;
6b9ea5a6
RP
4169 int err = 1;
4170 int nhn = 0;
4171 int replace = (cfg->fc_nlinfo.nlh &&
4172 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4173 LIST_HEAD(rt6_nh_list);
51ebd318 4174
3b1137fe
DA
4175 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4176 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4177 nlflags |= NLM_F_APPEND;
4178
35f1b4e9 4179 remaining = cfg->fc_mp_len;
51ebd318 4180 rtnh = (struct rtnexthop *)cfg->fc_mp;
51ebd318 4181
6b9ea5a6
RP
4182 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4183 * rt6_info structs per nexthop
4184 */
51ebd318
ND
4185 while (rtnh_ok(rtnh, remaining)) {
4186 memcpy(&r_cfg, cfg, sizeof(*cfg));
4187 if (rtnh->rtnh_ifindex)
4188 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4189
4190 attrlen = rtnh_attrlen(rtnh);
4191 if (attrlen > 0) {
4192 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4193
4194 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4195 if (nla) {
67b61f6c 4196 r_cfg.fc_gateway = nla_get_in6_addr(nla);
51ebd318
ND
4197 r_cfg.fc_flags |= RTF_GATEWAY;
4198 }
19e42e45
RP
4199 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4200 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4201 if (nla)
4202 r_cfg.fc_encap_type = nla_get_u16(nla);
51ebd318 4203 }
6b9ea5a6 4204
68e2ffde 4205 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
333c4301 4206 rt = ip6_route_info_create(&r_cfg, extack);
8c5b83f0
RP
4207 if (IS_ERR(rt)) {
4208 err = PTR_ERR(rt);
4209 rt = NULL;
6b9ea5a6 4210 goto cleanup;
8c5b83f0 4211 }
6b9ea5a6 4212
398958ae
IS
4213 rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4214
6b9ea5a6 4215 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
51ebd318 4216 if (err) {
587fea74 4217 dst_release_immediate(&rt->dst);
6b9ea5a6
RP
4218 goto cleanup;
4219 }
4220
4221 rtnh = rtnh_next(rtnh, &remaining);
4222 }
4223
3b1137fe
DA
4224 /* for add and replace send one notification with all nexthops.
4225 * Skip the notification in fib6_add_rt2node and send one with
4226 * the full route when done
4227 */
4228 info->skip_notify = 1;
4229
6b9ea5a6
RP
4230 err_nh = NULL;
4231 list_for_each_entry(nh, &rt6_nh_list, next) {
3b1137fe 4232 rt_last = nh->rt6_info;
333c4301 4233 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3b1137fe
DA
4234 /* save reference to first route for notification */
4235 if (!rt_notif && !err)
4236 rt_notif = nh->rt6_info;
4237
6b9ea5a6
RP
4238 /* nh->rt6_info is used or freed at this point, reset to NULL*/
4239 nh->rt6_info = NULL;
4240 if (err) {
4241 if (replace && nhn)
4242 ip6_print_replace_route_err(&rt6_nh_list);
4243 err_nh = nh;
4244 goto add_errout;
51ebd318 4245 }
6b9ea5a6 4246
1a72418b 4247 /* Because each route is added like a single route we remove
27596472
MK
4248 * these flags after the first nexthop: if there is a collision,
4249 * we have already failed to add the first nexthop:
4250 * fib6_add_rt2node() has rejected it; when replacing, old
4251 * nexthops have been replaced by first new, the rest should
4252 * be added to it.
1a72418b 4253 */
27596472
MK
4254 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4255 NLM_F_REPLACE);
6b9ea5a6
RP
4256 nhn++;
4257 }
4258
3b1137fe
DA
4259 /* success ... tell user about new route */
4260 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
6b9ea5a6
RP
4261 goto cleanup;
4262
4263add_errout:
3b1137fe
DA
4264 /* send notification for routes that were added so that
4265 * the delete notifications sent by ip6_route_del are
4266 * coherent
4267 */
4268 if (rt_notif)
4269 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4270
6b9ea5a6
RP
4271 /* Delete routes that were already added */
4272 list_for_each_entry(nh, &rt6_nh_list, next) {
4273 if (err_nh == nh)
4274 break;
333c4301 4275 ip6_route_del(&nh->r_cfg, extack);
6b9ea5a6
RP
4276 }
4277
4278cleanup:
4279 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
587fea74
WW
4280 if (nh->rt6_info)
4281 dst_release_immediate(&nh->rt6_info->dst);
52fe51f8 4282 kfree(nh->mxc.mx);
6b9ea5a6
RP
4283 list_del(&nh->next);
4284 kfree(nh);
4285 }
4286
4287 return err;
4288}
4289
333c4301
DA
4290static int ip6_route_multipath_del(struct fib6_config *cfg,
4291 struct netlink_ext_ack *extack)
6b9ea5a6
RP
4292{
4293 struct fib6_config r_cfg;
4294 struct rtnexthop *rtnh;
4295 int remaining;
4296 int attrlen;
4297 int err = 1, last_err = 0;
4298
4299 remaining = cfg->fc_mp_len;
4300 rtnh = (struct rtnexthop *)cfg->fc_mp;
4301
4302 /* Parse a Multipath Entry */
4303 while (rtnh_ok(rtnh, remaining)) {
4304 memcpy(&r_cfg, cfg, sizeof(*cfg));
4305 if (rtnh->rtnh_ifindex)
4306 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4307
4308 attrlen = rtnh_attrlen(rtnh);
4309 if (attrlen > 0) {
4310 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4311
4312 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4313 if (nla) {
4314 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4315 r_cfg.fc_flags |= RTF_GATEWAY;
4316 }
4317 }
333c4301 4318 err = ip6_route_del(&r_cfg, extack);
6b9ea5a6
RP
4319 if (err)
4320 last_err = err;
4321
51ebd318
ND
4322 rtnh = rtnh_next(rtnh, &remaining);
4323 }
4324
4325 return last_err;
4326}
4327
c21ef3e3
DA
4328static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4329 struct netlink_ext_ack *extack)
1da177e4 4330{
86872cb5
TG
4331 struct fib6_config cfg;
4332 int err;
1da177e4 4333
333c4301 4334 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
86872cb5
TG
4335 if (err < 0)
4336 return err;
4337
51ebd318 4338 if (cfg.fc_mp)
333c4301 4339 return ip6_route_multipath_del(&cfg, extack);
0ae81335
DA
4340 else {
4341 cfg.fc_delete_all_nh = 1;
333c4301 4342 return ip6_route_del(&cfg, extack);
0ae81335 4343 }
1da177e4
LT
4344}
4345
c21ef3e3
DA
4346static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4347 struct netlink_ext_ack *extack)
1da177e4 4348{
86872cb5
TG
4349 struct fib6_config cfg;
4350 int err;
1da177e4 4351
333c4301 4352 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
86872cb5
TG
4353 if (err < 0)
4354 return err;
4355
51ebd318 4356 if (cfg.fc_mp)
333c4301 4357 return ip6_route_multipath_add(&cfg, extack);
51ebd318 4358 else
333c4301 4359 return ip6_route_add(&cfg, extack);
1da177e4
LT
4360}
4361
beb1afac 4362static size_t rt6_nlmsg_size(struct rt6_info *rt)
339bf98f 4363{
beb1afac
DA
4364 int nexthop_len = 0;
4365
4366 if (rt->rt6i_nsiblings) {
4367 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4368 + NLA_ALIGN(sizeof(struct rtnexthop))
4369 + nla_total_size(16) /* RTA_GATEWAY */
beb1afac
DA
4370 + lwtunnel_get_encap_size(rt->dst.lwtstate);
4371
4372 nexthop_len *= rt->rt6i_nsiblings;
4373 }
4374
339bf98f
TG
4375 return NLMSG_ALIGN(sizeof(struct rtmsg))
4376 + nla_total_size(16) /* RTA_SRC */
4377 + nla_total_size(16) /* RTA_DST */
4378 + nla_total_size(16) /* RTA_GATEWAY */
4379 + nla_total_size(16) /* RTA_PREFSRC */
4380 + nla_total_size(4) /* RTA_TABLE */
4381 + nla_total_size(4) /* RTA_IIF */
4382 + nla_total_size(4) /* RTA_OIF */
4383 + nla_total_size(4) /* RTA_PRIORITY */
6a2b9ce0 4384 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
ea697639 4385 + nla_total_size(sizeof(struct rta_cacheinfo))
c78ba6d6 4386 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
19e42e45 4387 + nla_total_size(1) /* RTA_PREF */
beb1afac
DA
4388 + lwtunnel_get_encap_size(rt->dst.lwtstate)
4389 + nexthop_len;
4390}
4391
4392static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
5be083ce 4393 unsigned int *flags, bool skip_oif)
beb1afac 4394{
f9d882ea
IS
4395 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4396 *flags |= RTNH_F_DEAD;
4397
44c9f2f2 4398 if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
beb1afac
DA
4399 *flags |= RTNH_F_LINKDOWN;
4400 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4401 *flags |= RTNH_F_DEAD;
4402 }
4403
4404 if (rt->rt6i_flags & RTF_GATEWAY) {
4405 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4406 goto nla_put_failure;
4407 }
4408
fc1e64e1 4409 *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
fe400799 4410 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
61e4d01e
IS
4411 *flags |= RTNH_F_OFFLOAD;
4412
5be083ce
DA
4413 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4414 if (!skip_oif && rt->dst.dev &&
beb1afac
DA
4415 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4416 goto nla_put_failure;
4417
4418 if (rt->dst.lwtstate &&
4419 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4420 goto nla_put_failure;
4421
4422 return 0;
4423
4424nla_put_failure:
4425 return -EMSGSIZE;
4426}
4427
5be083ce 4428/* add multipath next hop */
beb1afac
DA
4429static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4430{
4431 struct rtnexthop *rtnh;
4432 unsigned int flags = 0;
4433
4434 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4435 if (!rtnh)
4436 goto nla_put_failure;
4437
398958ae 4438 rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
beb1afac
DA
4439 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4440
5be083ce 4441 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
beb1afac
DA
4442 goto nla_put_failure;
4443
4444 rtnh->rtnh_flags = flags;
4445
4446 /* length of rtnetlink header + attributes */
4447 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4448
4449 return 0;
4450
4451nla_put_failure:
4452 return -EMSGSIZE;
339bf98f
TG
4453}
4454
191cd582
BH
4455static int rt6_fill_node(struct net *net,
4456 struct sk_buff *skb, struct rt6_info *rt,
0d51aa80 4457 struct in6_addr *dst, struct in6_addr *src,
15e47304 4458 int iif, int type, u32 portid, u32 seq,
f8cfe2ce 4459 unsigned int flags)
1da177e4 4460{
4b32b5ad 4461 u32 metrics[RTAX_MAX];
1da177e4 4462 struct rtmsg *rtm;
2d7202bf 4463 struct nlmsghdr *nlh;
e3703b3d 4464 long expires;
9e762a4a 4465 u32 table;
1da177e4 4466
15e47304 4467 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
38308473 4468 if (!nlh)
26932566 4469 return -EMSGSIZE;
2d7202bf
TG
4470
4471 rtm = nlmsg_data(nlh);
1da177e4
LT
4472 rtm->rtm_family = AF_INET6;
4473 rtm->rtm_dst_len = rt->rt6i_dst.plen;
4474 rtm->rtm_src_len = rt->rt6i_src.plen;
4475 rtm->rtm_tos = 0;
c71099ac 4476 if (rt->rt6i_table)
9e762a4a 4477 table = rt->rt6i_table->tb6_id;
c71099ac 4478 else
9e762a4a
PM
4479 table = RT6_TABLE_UNSPEC;
4480 rtm->rtm_table = table;
c78679e8
DM
4481 if (nla_put_u32(skb, RTA_TABLE, table))
4482 goto nla_put_failure;
ef2c7d7b
ND
4483 if (rt->rt6i_flags & RTF_REJECT) {
4484 switch (rt->dst.error) {
4485 case -EINVAL:
4486 rtm->rtm_type = RTN_BLACKHOLE;
4487 break;
4488 case -EACCES:
4489 rtm->rtm_type = RTN_PROHIBIT;
4490 break;
b4949ab2
ND
4491 case -EAGAIN:
4492 rtm->rtm_type = RTN_THROW;
4493 break;
ef2c7d7b
ND
4494 default:
4495 rtm->rtm_type = RTN_UNREACHABLE;
4496 break;
4497 }
4498 }
38308473 4499 else if (rt->rt6i_flags & RTF_LOCAL)
ab79ad14 4500 rtm->rtm_type = RTN_LOCAL;
4ee39733
DA
4501 else if (rt->rt6i_flags & RTF_ANYCAST)
4502 rtm->rtm_type = RTN_ANYCAST;
d1918542 4503 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
1da177e4
LT
4504 rtm->rtm_type = RTN_LOCAL;
4505 else
4506 rtm->rtm_type = RTN_UNICAST;
4507 rtm->rtm_flags = 0;
4508 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4509 rtm->rtm_protocol = rt->rt6i_protocol;
1da177e4 4510
38308473 4511 if (rt->rt6i_flags & RTF_CACHE)
1da177e4
LT
4512 rtm->rtm_flags |= RTM_F_CLONED;
4513
4514 if (dst) {
930345ea 4515 if (nla_put_in6_addr(skb, RTA_DST, dst))
c78679e8 4516 goto nla_put_failure;
1ab1457c 4517 rtm->rtm_dst_len = 128;
1da177e4 4518 } else if (rtm->rtm_dst_len)
930345ea 4519 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
c78679e8 4520 goto nla_put_failure;
1da177e4
LT
4521#ifdef CONFIG_IPV6_SUBTREES
4522 if (src) {
930345ea 4523 if (nla_put_in6_addr(skb, RTA_SRC, src))
c78679e8 4524 goto nla_put_failure;
1ab1457c 4525 rtm->rtm_src_len = 128;
c78679e8 4526 } else if (rtm->rtm_src_len &&
930345ea 4527 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
c78679e8 4528 goto nla_put_failure;
1da177e4 4529#endif
7bc570c8
YH
4530 if (iif) {
4531#ifdef CONFIG_IPV6_MROUTE
4532 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
fd61c6ba
DA
4533 int err = ip6mr_get_route(net, skb, rtm, portid);
4534
4535 if (err == 0)
4536 return 0;
4537 if (err < 0)
4538 goto nla_put_failure;
7bc570c8
YH
4539 } else
4540#endif
c78679e8
DM
4541 if (nla_put_u32(skb, RTA_IIF, iif))
4542 goto nla_put_failure;
7bc570c8 4543 } else if (dst) {
1da177e4 4544 struct in6_addr saddr_buf;
c78679e8 4545 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
930345ea 4546 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
c78679e8 4547 goto nla_put_failure;
1da177e4 4548 }
2d7202bf 4549
c3968a85
DW
4550 if (rt->rt6i_prefsrc.plen) {
4551 struct in6_addr saddr_buf;
4e3fd7a0 4552 saddr_buf = rt->rt6i_prefsrc.addr;
930345ea 4553 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
c78679e8 4554 goto nla_put_failure;
c3968a85
DW
4555 }
4556
4b32b5ad
MKL
4557 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4558 if (rt->rt6i_pmtu)
4559 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4560 if (rtnetlink_put_metrics(skb, metrics) < 0)
2d7202bf
TG
4561 goto nla_put_failure;
4562
c78679e8
DM
4563 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4564 goto nla_put_failure;
8253947e 4565
beb1afac
DA
4566 /* For multipath routes, walk the siblings list and add
4567 * each as a nexthop within RTA_MULTIPATH.
4568 */
4569 if (rt->rt6i_nsiblings) {
4570 struct rt6_info *sibling, *next_sibling;
4571 struct nlattr *mp;
4572
4573 mp = nla_nest_start(skb, RTA_MULTIPATH);
4574 if (!mp)
4575 goto nla_put_failure;
4576
4577 if (rt6_add_nexthop(skb, rt) < 0)
4578 goto nla_put_failure;
4579
4580 list_for_each_entry_safe(sibling, next_sibling,
4581 &rt->rt6i_siblings, rt6i_siblings) {
4582 if (rt6_add_nexthop(skb, sibling) < 0)
4583 goto nla_put_failure;
4584 }
4585
4586 nla_nest_end(skb, mp);
4587 } else {
5be083ce 4588 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
beb1afac
DA
4589 goto nla_put_failure;
4590 }
4591
8253947e 4592 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
69cdf8f9 4593
87a50699 4594 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
e3703b3d 4595 goto nla_put_failure;
2d7202bf 4596
c78ba6d6
LR
4597 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4598 goto nla_put_failure;
4599
19e42e45 4600
053c095a
JB
4601 nlmsg_end(skb, nlh);
4602 return 0;
2d7202bf
TG
4603
4604nla_put_failure:
26932566
PM
4605 nlmsg_cancel(skb, nlh);
4606 return -EMSGSIZE;
1da177e4
LT
4607}
4608
1b43af54 4609int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1da177e4
LT
4610{
4611 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1f17e2f2
DA
4612 struct net *net = arg->net;
4613
4614 if (rt == net->ipv6.ip6_null_entry)
4615 return 0;
1da177e4 4616
2d7202bf
TG
4617 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4618 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
f8cfe2ce
DA
4619
4620 /* user wants prefix routes only */
4621 if (rtm->rtm_flags & RTM_F_PREFIX &&
4622 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4623 /* success since this is not a prefix route */
4624 return 1;
4625 }
4626 }
1da177e4 4627
1f17e2f2 4628 return rt6_fill_node(net,
191cd582 4629 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
15e47304 4630 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
f8cfe2ce 4631 NLM_F_MULTI);
1da177e4
LT
4632}
4633
c21ef3e3
DA
4634static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4635 struct netlink_ext_ack *extack)
1da177e4 4636{
3b1e0a65 4637 struct net *net = sock_net(in_skb->sk);
ab364a6f 4638 struct nlattr *tb[RTA_MAX+1];
18c3a61c
RP
4639 int err, iif = 0, oif = 0;
4640 struct dst_entry *dst;
ab364a6f 4641 struct rt6_info *rt;
1da177e4 4642 struct sk_buff *skb;
ab364a6f 4643 struct rtmsg *rtm;
4c9483b2 4644 struct flowi6 fl6;
18c3a61c 4645 bool fibmatch;
1da177e4 4646
fceb6435 4647 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
c21ef3e3 4648 extack);
ab364a6f
TG
4649 if (err < 0)
4650 goto errout;
1da177e4 4651
ab364a6f 4652 err = -EINVAL;
4c9483b2 4653 memset(&fl6, 0, sizeof(fl6));
38b7097b
HFS
4654 rtm = nlmsg_data(nlh);
4655 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
18c3a61c 4656 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
1da177e4 4657
ab364a6f
TG
4658 if (tb[RTA_SRC]) {
4659 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4660 goto errout;
4661
4e3fd7a0 4662 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
ab364a6f
TG
4663 }
4664
4665 if (tb[RTA_DST]) {
4666 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4667 goto errout;
4668
4e3fd7a0 4669 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
ab364a6f
TG
4670 }
4671
4672 if (tb[RTA_IIF])
4673 iif = nla_get_u32(tb[RTA_IIF]);
4674
4675 if (tb[RTA_OIF])
72331bc0 4676 oif = nla_get_u32(tb[RTA_OIF]);
1da177e4 4677
2e47b291
LC
4678 if (tb[RTA_MARK])
4679 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4680
622ec2c9
LC
4681 if (tb[RTA_UID])
4682 fl6.flowi6_uid = make_kuid(current_user_ns(),
4683 nla_get_u32(tb[RTA_UID]));
4684 else
4685 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4686
1da177e4
LT
4687 if (iif) {
4688 struct net_device *dev;
72331bc0
SL
4689 int flags = 0;
4690
121622db
FW
4691 rcu_read_lock();
4692
4693 dev = dev_get_by_index_rcu(net, iif);
1da177e4 4694 if (!dev) {
121622db 4695 rcu_read_unlock();
1da177e4 4696 err = -ENODEV;
ab364a6f 4697 goto errout;
1da177e4 4698 }
72331bc0
SL
4699
4700 fl6.flowi6_iif = iif;
4701
4702 if (!ipv6_addr_any(&fl6.saddr))
4703 flags |= RT6_LOOKUP_F_HAS_SADDR;
4704
b75cc8f9 4705 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
121622db
FW
4706
4707 rcu_read_unlock();
72331bc0
SL
4708 } else {
4709 fl6.flowi6_oif = oif;
4710
58acfd71 4711 dst = ip6_route_output(net, NULL, &fl6);
18c3a61c
RP
4712 }
4713
18c3a61c
RP
4714
4715 rt = container_of(dst, struct rt6_info, dst);
4716 if (rt->dst.error) {
4717 err = rt->dst.error;
4718 ip6_rt_put(rt);
4719 goto errout;
1da177e4
LT
4720 }
4721
9d6acb3b
WC
4722 if (rt == net->ipv6.ip6_null_entry) {
4723 err = rt->dst.error;
4724 ip6_rt_put(rt);
4725 goto errout;
4726 }
4727
fba961ab
DM
4728 if (fibmatch && rt->from) {
4729 struct rt6_info *ort = rt->from;
58acfd71
IS
4730
4731 dst_hold(&ort->dst);
4732 ip6_rt_put(rt);
4733 rt = ort;
4734 }
4735
ab364a6f 4736 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
38308473 4737 if (!skb) {
94e187c0 4738 ip6_rt_put(rt);
ab364a6f
TG
4739 err = -ENOBUFS;
4740 goto errout;
4741 }
1da177e4 4742
d8d1f30b 4743 skb_dst_set(skb, &rt->dst);
18c3a61c
RP
4744 if (fibmatch)
4745 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4746 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4747 nlh->nlmsg_seq, 0);
4748 else
4749 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4750 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4751 nlh->nlmsg_seq, 0);
1da177e4 4752 if (err < 0) {
ab364a6f
TG
4753 kfree_skb(skb);
4754 goto errout;
1da177e4
LT
4755 }
4756
15e47304 4757 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
ab364a6f 4758errout:
1da177e4 4759 return err;
1da177e4
LT
4760}
4761
37a1d361
RP
4762void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4763 unsigned int nlm_flags)
1da177e4
LT
4764{
4765 struct sk_buff *skb;
5578689a 4766 struct net *net = info->nl_net;
528c4ceb
DL
4767 u32 seq;
4768 int err;
4769
4770 err = -ENOBUFS;
38308473 4771 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
86872cb5 4772
19e42e45 4773 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
38308473 4774 if (!skb)
21713ebc
TG
4775 goto errout;
4776
191cd582 4777 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
f8cfe2ce 4778 event, info->portid, seq, nlm_flags);
26932566
PM
4779 if (err < 0) {
4780 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4781 WARN_ON(err == -EMSGSIZE);
4782 kfree_skb(skb);
4783 goto errout;
4784 }
15e47304 4785 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
1ce85fe4
PNA
4786 info->nlh, gfp_any());
4787 return;
21713ebc
TG
4788errout:
4789 if (err < 0)
5578689a 4790 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
1da177e4
LT
4791}
4792
8ed67789 4793static int ip6_route_dev_notify(struct notifier_block *this,
351638e7 4794 unsigned long event, void *ptr)
8ed67789 4795{
351638e7 4796 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 4797 struct net *net = dev_net(dev);
8ed67789 4798
242d3a49
WC
4799 if (!(dev->flags & IFF_LOOPBACK))
4800 return NOTIFY_OK;
4801
4802 if (event == NETDEV_REGISTER) {
d8d1f30b 4803 net->ipv6.ip6_null_entry->dst.dev = dev;
8ed67789
DL
4804 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4805#ifdef CONFIG_IPV6_MULTIPLE_TABLES
d8d1f30b 4806 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
8ed67789 4807 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
d8d1f30b 4808 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
8ed67789 4809 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
242d3a49 4810#endif
76da0704
WC
4811 } else if (event == NETDEV_UNREGISTER &&
4812 dev->reg_state != NETREG_UNREGISTERED) {
4813 /* NETDEV_UNREGISTER could be fired for multiple times by
4814 * netdev_wait_allrefs(). Make sure we only call this once.
4815 */
12d94a80 4816 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
242d3a49 4817#ifdef CONFIG_IPV6_MULTIPLE_TABLES
12d94a80
ED
4818 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4819 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
8ed67789
DL
4820#endif
4821 }
4822
4823 return NOTIFY_OK;
4824}
4825
1da177e4
LT
4826/*
4827 * /proc
4828 */
4829
4830#ifdef CONFIG_PROC_FS
4831
33120b30 4832static const struct file_operations ipv6_route_proc_fops = {
33120b30
AD
4833 .open = ipv6_route_open,
4834 .read = seq_read,
4835 .llseek = seq_lseek,
8d2ca1d7 4836 .release = seq_release_net,
33120b30
AD
4837};
4838
1da177e4
LT
4839static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4840{
69ddb805 4841 struct net *net = (struct net *)seq->private;
1da177e4 4842 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
69ddb805
DL
4843 net->ipv6.rt6_stats->fib_nodes,
4844 net->ipv6.rt6_stats->fib_route_nodes,
81eb8447 4845 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
69ddb805
DL
4846 net->ipv6.rt6_stats->fib_rt_entries,
4847 net->ipv6.rt6_stats->fib_rt_cache,
fc66f95c 4848 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
69ddb805 4849 net->ipv6.rt6_stats->fib_discarded_routes);
1da177e4
LT
4850
4851 return 0;
4852}
4853
4854static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4855{
de05c557 4856 return single_open_net(inode, file, rt6_stats_seq_show);
69ddb805
DL
4857}
4858
9a32144e 4859static const struct file_operations rt6_stats_seq_fops = {
1da177e4
LT
4860 .open = rt6_stats_seq_open,
4861 .read = seq_read,
4862 .llseek = seq_lseek,
b6fcbdb4 4863 .release = single_release_net,
1da177e4
LT
4864};
4865#endif /* CONFIG_PROC_FS */
4866
4867#ifdef CONFIG_SYSCTL
4868
1da177e4 4869static
fe2c6338 4870int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
1da177e4
LT
4871 void __user *buffer, size_t *lenp, loff_t *ppos)
4872{
c486da34
LAG
4873 struct net *net;
4874 int delay;
4875 if (!write)
1da177e4 4876 return -EINVAL;
c486da34
LAG
4877
4878 net = (struct net *)ctl->extra1;
4879 delay = net->ipv6.sysctl.flush_delay;
4880 proc_dointvec(ctl, write, buffer, lenp, ppos);
2ac3ac8f 4881 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
c486da34 4882 return 0;
1da177e4
LT
4883}
4884
fe2c6338 4885struct ctl_table ipv6_route_table_template[] = {
1ab1457c 4886 {
1da177e4 4887 .procname = "flush",
4990509f 4888 .data = &init_net.ipv6.sysctl.flush_delay,
1da177e4 4889 .maxlen = sizeof(int),
89c8b3a1 4890 .mode = 0200,
6d9f239a 4891 .proc_handler = ipv6_sysctl_rtcache_flush
1da177e4
LT
4892 },
4893 {
1da177e4 4894 .procname = "gc_thresh",
9a7ec3a9 4895 .data = &ip6_dst_ops_template.gc_thresh,
1da177e4
LT
4896 .maxlen = sizeof(int),
4897 .mode = 0644,
6d9f239a 4898 .proc_handler = proc_dointvec,
1da177e4
LT
4899 },
4900 {
1da177e4 4901 .procname = "max_size",
4990509f 4902 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
1da177e4
LT
4903 .maxlen = sizeof(int),
4904 .mode = 0644,
6d9f239a 4905 .proc_handler = proc_dointvec,
1da177e4
LT
4906 },
4907 {
1da177e4 4908 .procname = "gc_min_interval",
4990509f 4909 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
1da177e4
LT
4910 .maxlen = sizeof(int),
4911 .mode = 0644,
6d9f239a 4912 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
4913 },
4914 {
1da177e4 4915 .procname = "gc_timeout",
4990509f 4916 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
1da177e4
LT
4917 .maxlen = sizeof(int),
4918 .mode = 0644,
6d9f239a 4919 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
4920 },
4921 {
1da177e4 4922 .procname = "gc_interval",
4990509f 4923 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
1da177e4
LT
4924 .maxlen = sizeof(int),
4925 .mode = 0644,
6d9f239a 4926 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
4927 },
4928 {
1da177e4 4929 .procname = "gc_elasticity",
4990509f 4930 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
1da177e4
LT
4931 .maxlen = sizeof(int),
4932 .mode = 0644,
f3d3f616 4933 .proc_handler = proc_dointvec,
1da177e4
LT
4934 },
4935 {
1da177e4 4936 .procname = "mtu_expires",
4990509f 4937 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
1da177e4
LT
4938 .maxlen = sizeof(int),
4939 .mode = 0644,
6d9f239a 4940 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
4941 },
4942 {
1da177e4 4943 .procname = "min_adv_mss",
4990509f 4944 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
1da177e4
LT
4945 .maxlen = sizeof(int),
4946 .mode = 0644,
f3d3f616 4947 .proc_handler = proc_dointvec,
1da177e4
LT
4948 },
4949 {
1da177e4 4950 .procname = "gc_min_interval_ms",
4990509f 4951 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
1da177e4
LT
4952 .maxlen = sizeof(int),
4953 .mode = 0644,
6d9f239a 4954 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4 4955 },
f8572d8f 4956 { }
1da177e4
LT
4957};
4958
2c8c1e72 4959struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
760f2d01
DL
4960{
4961 struct ctl_table *table;
4962
4963 table = kmemdup(ipv6_route_table_template,
4964 sizeof(ipv6_route_table_template),
4965 GFP_KERNEL);
5ee09105
YH
4966
4967 if (table) {
4968 table[0].data = &net->ipv6.sysctl.flush_delay;
c486da34 4969 table[0].extra1 = net;
86393e52 4970 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5ee09105
YH
4971 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4972 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4973 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4974 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4975 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4976 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4977 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
9c69fabe 4978 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
464dc801
EB
4979
4980 /* Don't export sysctls to unprivileged users */
4981 if (net->user_ns != &init_user_ns)
4982 table[0].procname = NULL;
5ee09105
YH
4983 }
4984
760f2d01
DL
4985 return table;
4986}
1da177e4
LT
4987#endif
4988
2c8c1e72 4989static int __net_init ip6_route_net_init(struct net *net)
cdb18761 4990{
633d424b 4991 int ret = -ENOMEM;
8ed67789 4992
86393e52
AD
4993 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4994 sizeof(net->ipv6.ip6_dst_ops));
f2fc6a54 4995
fc66f95c
ED
4996 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4997 goto out_ip6_dst_ops;
4998
8ed67789
DL
4999 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5000 sizeof(*net->ipv6.ip6_null_entry),
5001 GFP_KERNEL);
5002 if (!net->ipv6.ip6_null_entry)
fc66f95c 5003 goto out_ip6_dst_entries;
d8d1f30b 5004 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
62fa8a84
DM
5005 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5006 ip6_template_metrics, true);
8ed67789
DL
5007
5008#ifdef CONFIG_IPV6_MULTIPLE_TABLES
feca7d8c 5009 net->ipv6.fib6_has_custom_rules = false;
8ed67789
DL
5010 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5011 sizeof(*net->ipv6.ip6_prohibit_entry),
5012 GFP_KERNEL);
68fffc67
PZ
5013 if (!net->ipv6.ip6_prohibit_entry)
5014 goto out_ip6_null_entry;
d8d1f30b 5015 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
62fa8a84
DM
5016 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5017 ip6_template_metrics, true);
8ed67789
DL
5018
5019 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5020 sizeof(*net->ipv6.ip6_blk_hole_entry),
5021 GFP_KERNEL);
68fffc67
PZ
5022 if (!net->ipv6.ip6_blk_hole_entry)
5023 goto out_ip6_prohibit_entry;
d8d1f30b 5024 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
62fa8a84
DM
5025 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5026 ip6_template_metrics, true);
8ed67789
DL
5027#endif
5028
b339a47c
PZ
5029 net->ipv6.sysctl.flush_delay = 0;
5030 net->ipv6.sysctl.ip6_rt_max_size = 4096;
5031 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5032 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5033 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5034 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5035 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5036 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5037
6891a346
BT
5038 net->ipv6.ip6_rt_gc_expire = 30*HZ;
5039
8ed67789
DL
5040 ret = 0;
5041out:
5042 return ret;
f2fc6a54 5043
68fffc67
PZ
5044#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5045out_ip6_prohibit_entry:
5046 kfree(net->ipv6.ip6_prohibit_entry);
5047out_ip6_null_entry:
5048 kfree(net->ipv6.ip6_null_entry);
5049#endif
fc66f95c
ED
5050out_ip6_dst_entries:
5051 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
f2fc6a54 5052out_ip6_dst_ops:
f2fc6a54 5053 goto out;
cdb18761
DL
5054}
5055
2c8c1e72 5056static void __net_exit ip6_route_net_exit(struct net *net)
cdb18761 5057{
8ed67789
DL
5058 kfree(net->ipv6.ip6_null_entry);
5059#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5060 kfree(net->ipv6.ip6_prohibit_entry);
5061 kfree(net->ipv6.ip6_blk_hole_entry);
5062#endif
41bb78b4 5063 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
cdb18761
DL
5064}
5065
d189634e
TG
5066static int __net_init ip6_route_net_init_late(struct net *net)
5067{
5068#ifdef CONFIG_PROC_FS
d4beaa66 5069 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
d6444062 5070 proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
d189634e
TG
5071#endif
5072 return 0;
5073}
5074
5075static void __net_exit ip6_route_net_exit_late(struct net *net)
5076{
5077#ifdef CONFIG_PROC_FS
ece31ffd
G
5078 remove_proc_entry("ipv6_route", net->proc_net);
5079 remove_proc_entry("rt6_stats", net->proc_net);
d189634e
TG
5080#endif
5081}
5082
cdb18761
DL
5083static struct pernet_operations ip6_route_net_ops = {
5084 .init = ip6_route_net_init,
5085 .exit = ip6_route_net_exit,
5086};
5087
c3426b47
DM
5088static int __net_init ipv6_inetpeer_init(struct net *net)
5089{
5090 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5091
5092 if (!bp)
5093 return -ENOMEM;
5094 inet_peer_base_init(bp);
5095 net->ipv6.peers = bp;
5096 return 0;
5097}
5098
5099static void __net_exit ipv6_inetpeer_exit(struct net *net)
5100{
5101 struct inet_peer_base *bp = net->ipv6.peers;
5102
5103 net->ipv6.peers = NULL;
56a6b248 5104 inetpeer_invalidate_tree(bp);
c3426b47
DM
5105 kfree(bp);
5106}
5107
2b823f72 5108static struct pernet_operations ipv6_inetpeer_ops = {
c3426b47
DM
5109 .init = ipv6_inetpeer_init,
5110 .exit = ipv6_inetpeer_exit,
5111};
5112
d189634e
TG
5113static struct pernet_operations ip6_route_net_late_ops = {
5114 .init = ip6_route_net_init_late,
5115 .exit = ip6_route_net_exit_late,
5116};
5117
8ed67789
DL
5118static struct notifier_block ip6_route_dev_notifier = {
5119 .notifier_call = ip6_route_dev_notify,
242d3a49 5120 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
8ed67789
DL
5121};
5122
2f460933
WC
5123void __init ip6_route_init_special_entries(void)
5124{
5125 /* Registering of the loopback is done before this portion of code,
5126 * the loopback reference in rt6_info will not be taken, do it
5127 * manually for init_net */
5128 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5129 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5130 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5131 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5132 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5133 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5134 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5135 #endif
5136}
5137
433d49c3 5138int __init ip6_route_init(void)
1da177e4 5139{
433d49c3 5140 int ret;
8d0b94af 5141 int cpu;
433d49c3 5142
9a7ec3a9
DL
5143 ret = -ENOMEM;
5144 ip6_dst_ops_template.kmem_cachep =
e5d679f3 5145 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
f845ab6b 5146 SLAB_HWCACHE_ALIGN, NULL);
9a7ec3a9 5147 if (!ip6_dst_ops_template.kmem_cachep)
c19a28e1 5148 goto out;
14e50e57 5149
fc66f95c 5150 ret = dst_entries_init(&ip6_dst_blackhole_ops);
8ed67789 5151 if (ret)
bdb3289f 5152 goto out_kmem_cache;
bdb3289f 5153
c3426b47
DM
5154 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5155 if (ret)
e8803b6c 5156 goto out_dst_entries;
2a0c451a 5157
7e52b33b
DM
5158 ret = register_pernet_subsys(&ip6_route_net_ops);
5159 if (ret)
5160 goto out_register_inetpeer;
c3426b47 5161
5dc121e9
AE
5162 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5163
e8803b6c 5164 ret = fib6_init();
433d49c3 5165 if (ret)
8ed67789 5166 goto out_register_subsys;
433d49c3 5167
433d49c3
DL
5168 ret = xfrm6_init();
5169 if (ret)
e8803b6c 5170 goto out_fib6_init;
c35b7e72 5171
433d49c3
DL
5172 ret = fib6_rules_init();
5173 if (ret)
5174 goto xfrm6_init;
7e5449c2 5175
d189634e
TG
5176 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5177 if (ret)
5178 goto fib6_rules_init;
5179
16feebcf
FW
5180 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5181 inet6_rtm_newroute, NULL, 0);
5182 if (ret < 0)
5183 goto out_register_late_subsys;
5184
5185 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5186 inet6_rtm_delroute, NULL, 0);
5187 if (ret < 0)
5188 goto out_register_late_subsys;
5189
5190 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5191 inet6_rtm_getroute, NULL,
5192 RTNL_FLAG_DOIT_UNLOCKED);
5193 if (ret < 0)
d189634e 5194 goto out_register_late_subsys;
c127ea2c 5195
8ed67789 5196 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
cdb18761 5197 if (ret)
d189634e 5198 goto out_register_late_subsys;
8ed67789 5199
8d0b94af
MKL
5200 for_each_possible_cpu(cpu) {
5201 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5202
5203 INIT_LIST_HEAD(&ul->head);
5204 spin_lock_init(&ul->lock);
5205 }
5206
433d49c3
DL
5207out:
5208 return ret;
5209
d189634e 5210out_register_late_subsys:
16feebcf 5211 rtnl_unregister_all(PF_INET6);
d189634e 5212 unregister_pernet_subsys(&ip6_route_net_late_ops);
433d49c3 5213fib6_rules_init:
433d49c3
DL
5214 fib6_rules_cleanup();
5215xfrm6_init:
433d49c3 5216 xfrm6_fini();
2a0c451a
TG
5217out_fib6_init:
5218 fib6_gc_cleanup();
8ed67789
DL
5219out_register_subsys:
5220 unregister_pernet_subsys(&ip6_route_net_ops);
7e52b33b
DM
5221out_register_inetpeer:
5222 unregister_pernet_subsys(&ipv6_inetpeer_ops);
fc66f95c
ED
5223out_dst_entries:
5224 dst_entries_destroy(&ip6_dst_blackhole_ops);
433d49c3 5225out_kmem_cache:
f2fc6a54 5226 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
433d49c3 5227 goto out;
1da177e4
LT
5228}
5229
5230void ip6_route_cleanup(void)
5231{
8ed67789 5232 unregister_netdevice_notifier(&ip6_route_dev_notifier);
d189634e 5233 unregister_pernet_subsys(&ip6_route_net_late_ops);
101367c2 5234 fib6_rules_cleanup();
1da177e4 5235 xfrm6_fini();
1da177e4 5236 fib6_gc_cleanup();
c3426b47 5237 unregister_pernet_subsys(&ipv6_inetpeer_ops);
8ed67789 5238 unregister_pernet_subsys(&ip6_route_net_ops);
41bb78b4 5239 dst_entries_destroy(&ip6_dst_blackhole_ops);
f2fc6a54 5240 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
1da177e4 5241}