/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(&rt->from->dst);
}

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}

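/* Choose the address to use for the neighbour lookup: the route's
 * gateway when one is set, otherwise the packet's destination when an
 * skb is available, and finally the daddr supplied by the caller.
 */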
static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= ipv6_cow_metrics,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}

struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (!rt->rt6i_pcpu) {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	rt->from = NULL;
	dst_release(&from->dst);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired(rt->from);
	}
	return false;
}

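/* Hash-threshold multipath selection: fl6->mp_hash is matched against
 * each nexthop's region of the hash space (rt6i_nh_upper_bound), so a
 * given flow consistently maps to the same sibling as long as the
 * nexthop set is stable.
 */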
static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In
	 * such a case it will always be non-zero. Otherwise now is the
	 * time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);

	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
				 rt6i_siblings) {
		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct rt6_info *rt6_device_match(struct net *net,
						struct rt6_info *rt,
						const struct in6_addr *saddr,
						int oif,
						int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}

	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now; however, we need to check if it
	 * is really so, aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

/*
 *	Default Router Selection (RFC 2461 6.3.6)
 */
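
/* Device match scores used by rt6_score_route():
 * 2 - rt's device matches oif exactly (or no oif was requested),
 * 1 - rt is on loopback but its inet6_dev points back to oif,
 * 0 - no match at all.
 */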
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

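/* Overall route score: the rt6_check_dev() value occupies the low two
 * bits and, when CONFIG_IPV6_ROUTER_PREF is enabled, the decoded router
 * preference is folded in above them.  Negative results are the
 * RT6_NUD_FAIL_* codes propagated from rt6_check_neigh().
 */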
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);

		if (n < 0)
			return n;
	}
	return m;
}

static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;

	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
		goto out;

	if (idev->cnf.ignore_routes_with_linkdown &&
	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.ip6_null_entry)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2)
			return -EINVAL;
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1)
			return -EINVAL;
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

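/* Walk up the fib6 tree from fn, detouring into a parent's source
 * subtree (CONFIG_IPV6_SUBTREES) when one exists, until a node carrying
 * route info (RTN_RTINFO) is found.  Returns NULL once the tree root is
 * reached without a match.
 */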
static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;

	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

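/* Try to take a reference on *prt.  When that fails (the route is being
 * released), either substitute the held null entry (null_fallback) or
 * clear *prt.  Returns true only if the original route was held.
 */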
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(rt, fl6,
						  fl6->flowi6_oif, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

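/* Convenience wrapper around fib6_rule_lookup() for callers that have
 * plain addresses instead of a flowi6.  A hypothetical caller (for
 * illustration only) would do:
 *
 *	rt = rt6_lookup(net, &daddr, NULL, 0, 0);
 *	if (rt) {
 *		...use rt...
 *		ip6_rt_put(rt);	(a successful lookup returns a held route)
 *	}
 */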
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * The caller must hold a dst reference before calling it.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}

/* called with rcu_read_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device itself if it is a master device, the master
		 * device if the device is enslaved, and the loopback
		 * device as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* in the remaining case netif_is_l3_master(dev) is true,
		 * and dev itself is what we want returned
		 */
	}

	return dev;
}

static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}

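/* Claim this cpu's pcpu slot for a freshly allocated clone.  The slot
 * is written with cmpxchg(); the BUG_ON(prev) below assumes the caller
 * (ip6_pol_route()) runs with bottom halves disabled after having seen
 * the slot empty, so no other writer can race for the same pointer.
 */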
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}

/* exception hash table implementation */
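
/* Each fib6 route may lazily allocate an array of
 * FIB6_EXCEPTION_BUCKET_SIZE hlist buckets holding its RTF_CACHE clones
 * (PMTU and redirect exceptions).  Writers serialize on
 * rt6_exception_lock; readers walk the chains under rcu_read_lock().
 */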
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

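/* Map a (dst, src) pair to a bucket index.  Conceptually (a sketch of
 * the expression below):
 *
 *	idx = hash_32(jhash(dst, sizeof(*dst), seed), BUCKET_SIZE_SHIFT);
 *
 * with src folded into the jhash chain under CONFIG_IPV6_SUBTREES.
 * The seed is drawn once, so bucket placement is not predictable from
 * the addresses alone.
 */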
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->rt6i_table->tb6_lock);
		fib6_update_sernum(ort);
		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find the cached rt in the hash table inside the passed-in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->dst.from have already
			 * been updated.
			 */
			if (entry->rt6i_pmtu &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				entry->rt6i_pmtu = mtu;
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* We are pruning and obsoleting aged-out and non-gateway
	 * exceptions even if others still have references to them, so
	 * that on the next dst_check() such references can be dropped.
	 * EXPIRES exceptions, e.g. pmtu-generated ones, are pruned when
	 * expired, independently of their aging, as per RFC 8201 section 4.
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock_bh(&rt6_exception_lock);
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (rt == net->ipv6.ip6_null_entry) {
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		}
		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look up the route here.
		 */

		struct rt6_info *uncached_rt;

		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
		} else {
			rcu_read_unlock();
			uncached_rt = rt;
			goto uncached_rt_out;
		}
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

uncached_rt_out:
		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		dst_use_noref(&rt->dst, jiffies);
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (!pcpu_rt) {
			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
				/* No dst_hold() on rt is needed because grabbing
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
				pcpu_rt = rt6_make_pcpu_route(rt);
				rt6_release(rt);
			} else {
				/* rt is already removed from tree */
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);
			}
		}
		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	memset(keys, 0, sizeof(*keys));
	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	keys->addrs.v6addrs.src = key_iph->saddr;
	keys->addrs.v6addrs.dst = key_iph->daddr;
	keys->tags.flow_label = ip6_flowinfo(key_iph);
	keys->basic.ip_proto = key_iph->nexthdr;
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
{
	struct flow_keys hash_keys;

	if (skb) {
		ip6_multipath_l3_keys(skb, &hash_keys);
		return flow_hash_from_keys(&hash_keys) >> 1;
	}

	return get_hash_from_flowi6(fl6) >> 1;
}
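
/* Both calling conventions appear in this file: ip6_route_input()
 * passes the skb, so ICMPv6 errors hash on the embedded (inner) header,
 * while rt6_multipath_select() hashes the flowi6 instead:
 *
 *	fl6.mp_hash = rt6_multipath_hash(&fl6, skb);	(input path)
 *	fl6->mp_hash = rt6_multipath_hash(fl6, NULL);	(lookup path)
 */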

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

1954/*
1955 * Destination cache support functions
1956 */
1957
1958static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1959{
1960 if (rt->from &&
1961 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
1962 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
1963}
1964
1965static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1966{
1967 u32 rt_cookie = 0;
1968
1969 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1970 return NULL;
1971
1972 if (rt6_check_expired(rt))
1973 return NULL;
1974
1975 return &rt->dst;
1976}
1977
1978static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1979{
1980 if (!__rt6_check_expired(rt) &&
1981 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1982 rt6_check(rt->from, cookie))
1983 return &rt->dst;
1984 else
1985 return NULL;
1986}
1987
1988static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1989{
1990 struct rt6_info *rt;
1991
1992 rt = (struct rt6_info *) dst;
1993
1994 /* All IPV6 dsts are created with ->obsolete set to the value
1995 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1996 * into this function always.
1997 */
1998
1999 rt6_dst_from_metrics_check(rt);
2000
2001 if (rt->rt6i_flags & RTF_PCPU ||
2002 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2003 return rt6_dst_from_check(rt, cookie);
2004 else
2005 return rt6_check(rt, cookie);
2006}
2007
2008static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2009{
2010 struct rt6_info *rt = (struct rt6_info *) dst;
2011
2012 if (rt) {
2013 if (rt->rt6i_flags & RTF_CACHE) {
2014 if (rt6_check_expired(rt)) {
2015 ip6_del_rt(rt);
2016 dst = NULL;
2017 }
2018 } else {
2019 dst_release(dst);
2020 dst = NULL;
2021 }
2022 }
2023 return dst;
2024}
2025
2026static void ip6_link_failure(struct sk_buff *skb)
2027{
2028 struct rt6_info *rt;
2029
2030 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2031
2032 rt = (struct rt6_info *) skb_dst(skb);
2033 if (rt) {
2034 if (rt->rt6i_flags & RTF_CACHE) {
2035 if (dst_hold_safe(&rt->dst))
2036 ip6_del_rt(rt);
2037 } else {
2038 struct fib6_node *fn;
2039
2040 rcu_read_lock();
2041 fn = rcu_dereference(rt->rt6i_node);
2042 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2043 fn->fn_sernum = -1;
2044 rcu_read_unlock();
2045 }
2046 }
2047}
2048
2049static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2050{
2051 struct net *net = dev_net(rt->dst.dev);
2052
2053 rt->rt6i_flags |= RTF_MODIFIED;
2054 rt->rt6i_pmtu = mtu;
2055 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2056}
2057
2058static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2059{
2060 return !(rt->rt6i_flags & RTF_CACHE) &&
2061 (rt->rt6i_flags & RTF_PCPU ||
2062 rcu_access_pointer(rt->rt6i_node));
2063}
2064
2065static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2066 const struct ipv6hdr *iph, u32 mtu)
2067{
2068 const struct in6_addr *daddr, *saddr;
2069 struct rt6_info *rt6 = (struct rt6_info *)dst;
2070
2071 if (rt6->rt6i_flags & RTF_LOCAL)
2072 return;
2073
2074 if (dst_metric_locked(dst, RTAX_MTU))
2075 return;
2076
2077 if (iph) {
2078 daddr = &iph->daddr;
2079 saddr = &iph->saddr;
2080 } else if (sk) {
2081 daddr = &sk->sk_v6_daddr;
2082 saddr = &inet6_sk(sk)->saddr;
2083 } else {
2084 daddr = NULL;
2085 saddr = NULL;
2086 }
2087 dst_confirm_neigh(dst, daddr);
2088 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2089 if (mtu >= dst_mtu(dst))
2090 return;
2091
2092 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2093 rt6_do_update_pmtu(rt6, mtu);
2094 /* update rt6_ex->stamp for cache */
2095 if (rt6->rt6i_flags & RTF_CACHE)
2096 rt6_update_exception_stamp_rt(rt6);
2097 } else if (daddr) {
2098 struct rt6_info *nrt6;
2099
2100 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2101 if (nrt6) {
2102 rt6_do_update_pmtu(nrt6, mtu);
2103 if (rt6_insert_exception(nrt6, rt6))
2104 dst_release_immediate(&nrt6->dst);
2105 }
2106 }
2107}
2108
2109static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2110 struct sk_buff *skb, u32 mtu)
2111{
2112 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2113}
2114
2115void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2116 int oif, u32 mark, kuid_t uid)
2117{
2118 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2119 struct dst_entry *dst;
2120 struct flowi6 fl6;
2121
2122 memset(&fl6, 0, sizeof(fl6));
2123 fl6.flowi6_oif = oif;
2124 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2125 fl6.daddr = iph->daddr;
2126 fl6.saddr = iph->saddr;
2127 fl6.flowlabel = ip6_flowinfo(iph);
2128 fl6.flowi6_uid = uid;
2129
2130 dst = ip6_route_output(net, NULL, &fl6);
2131 if (!dst->error)
2132 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2133 dst_release(dst);
2134}
2135EXPORT_SYMBOL_GPL(ip6_update_pmtu);
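
/* Example (hedged): ICMPv6 error handlers of tunnels and transports
 * typically propagate a Packet Too Big with something like
 *
 *	ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
 *
 * where info is the MTU field of the ICMP error in network byte order
 * (hence the ntohl() above); oif/mark/uid choices differ per caller.
 */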
2136
2137void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2138{
2139 struct dst_entry *dst;
2140
2141 ip6_update_pmtu(skb, sock_net(sk), mtu,
2142 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2143
2144 dst = __sk_dst_get(sk);
2145 if (!dst || !dst->obsolete ||
2146 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2147 return;
2148
2149 bh_lock_sock(sk);
2150 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2151 ip6_datagram_dst_update(sk, false);
2152 bh_unlock_sock(sk);
2153}
2154EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2155
2156/* Handle redirects */
2157struct ip6rd_flowi {
2158 struct flowi6 fl6;
2159 struct in6_addr gateway;
2160};
2161
2162static struct rt6_info *__ip6_route_redirect(struct net *net,
2163 struct fib6_table *table,
2164 struct flowi6 *fl6,
2165 int flags)
2166{
2167 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2168 struct rt6_info *rt, *rt_cache;
2169 struct fib6_node *fn;
2170
2171 /* Get the "current" route for this destination and
2172	 * check if the redirect has come from the appropriate router.
2173 *
2174 * RFC 4861 specifies that redirects should only be
2175 * accepted if they come from the nexthop to the target.
2176 * Due to the way the routes are chosen, this notion
2177 * is a bit fuzzy and one might need to check all possible
2178 * routes.
2179 */
2180
2181 rcu_read_lock();
2182 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2183restart:
2184 for_each_fib6_node_rt_rcu(fn) {
2185 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2186 continue;
2187 if (rt6_check_expired(rt))
2188 continue;
2189 if (rt->dst.error)
2190 break;
2191 if (!(rt->rt6i_flags & RTF_GATEWAY))
2192 continue;
2193 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2194 continue;
2195 /* rt_cache's gateway might be different from its 'parent'
2196 * in the case of an ip redirect.
2197 * So we keep searching in the exception table if the gateway
2198 * is different.
2199 */
2200 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2201 rt_cache = rt6_find_cached_rt(rt,
2202 &fl6->daddr,
2203 &fl6->saddr);
2204 if (rt_cache &&
2205 ipv6_addr_equal(&rdfl->gateway,
2206 &rt_cache->rt6i_gateway)) {
2207 rt = rt_cache;
2208 break;
2209 }
2210 continue;
2211 }
2212 break;
2213 }
2214
2215 if (!rt)
2216 rt = net->ipv6.ip6_null_entry;
2217 else if (rt->dst.error) {
2218 rt = net->ipv6.ip6_null_entry;
2219 goto out;
2220 }
2221
2222 if (rt == net->ipv6.ip6_null_entry) {
2223 fn = fib6_backtrack(fn, &fl6->saddr);
2224 if (fn)
2225 goto restart;
2226 }
2227
2228out:
2229 ip6_hold_safe(net, &rt, true);
2230
2231 rcu_read_unlock();
2232
2233 trace_fib6_table_lookup(net, rt, table, fl6);
2234 return rt;
2235}
2236
2237static struct dst_entry *ip6_route_redirect(struct net *net,
2238 const struct flowi6 *fl6,
2239 const struct in6_addr *gateway)
2240{
2241 int flags = RT6_LOOKUP_F_HAS_SADDR;
2242 struct ip6rd_flowi rdfl;
2243
2244 rdfl.fl6 = *fl6;
2245 rdfl.gateway = *gateway;
2246
2247 return fib6_rule_lookup(net, &rdfl.fl6,
2248 flags, __ip6_route_redirect);
2249}
2250
2251void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2252 kuid_t uid)
2253{
2254 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2255 struct dst_entry *dst;
2256 struct flowi6 fl6;
2257
2258 memset(&fl6, 0, sizeof(fl6));
2259 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2260 fl6.flowi6_oif = oif;
2261 fl6.flowi6_mark = mark;
2262 fl6.daddr = iph->daddr;
2263 fl6.saddr = iph->saddr;
2264 fl6.flowlabel = ip6_flowinfo(iph);
2265 fl6.flowi6_uid = uid;
2266
2267 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2268 rt6_do_redirect(dst, NULL, skb);
2269 dst_release(dst);
2270}
2271EXPORT_SYMBOL_GPL(ip6_redirect);
2272
2273void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2274 u32 mark)
2275{
2276 const struct ipv6hdr *iph = ipv6_hdr(skb);
2277 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2278 struct dst_entry *dst;
2279 struct flowi6 fl6;
2280
2281 memset(&fl6, 0, sizeof(fl6));
2282 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2283 fl6.flowi6_oif = oif;
2284 fl6.flowi6_mark = mark;
2285 fl6.daddr = msg->dest;
2286 fl6.saddr = iph->daddr;
2287 fl6.flowi6_uid = sock_net_uid(net, NULL);
2288
2289 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2290 rt6_do_redirect(dst, NULL, skb);
2291 dst_release(dst);
2292}
2293
2294void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2295{
2296 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2297 sk->sk_uid);
2298}
2299EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2300
2301static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2302{
2303 struct net_device *dev = dst->dev;
2304 unsigned int mtu = dst_mtu(dst);
2305 struct net *net = dev_net(dev);
2306
2307 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2308
2309 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2310 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2311
2312 /*
2313 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2314 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2315 * IPV6_MAXPLEN is also valid and means: "any MSS,
2316 * rely only on pmtu discovery"
2317 */
2318 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2319 mtu = IPV6_MAXPLEN;
2320 return mtu;
2321}
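
/* Worked example: with a standard Ethernet MTU of 1500, the advertised
 * MSS computed above is 1500 - 40 (ipv6hdr) - 20 (tcphdr) = 1440,
 * assuming ip6_rt_min_advmss does not clamp it upwards.
 */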
2322
2323static unsigned int ip6_mtu(const struct dst_entry *dst)
2324{
2325 const struct rt6_info *rt = (const struct rt6_info *)dst;
2326 unsigned int mtu = rt->rt6i_pmtu;
2327 struct inet6_dev *idev;
2328
2329 if (mtu)
2330 goto out;
2331
2332 mtu = dst_metric_raw(dst, RTAX_MTU);
2333 if (mtu)
2334 goto out;
2335
2336 mtu = IPV6_MIN_MTU;
2337
2338 rcu_read_lock();
2339 idev = __in6_dev_get(dst->dev);
2340 if (idev)
2341 mtu = idev->cnf.mtu6;
2342 rcu_read_unlock();
2343
2344out:
2345 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2346
2347 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2348}
2349
2350struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2351 struct flowi6 *fl6)
2352{
2353 struct dst_entry *dst;
2354 struct rt6_info *rt;
2355 struct inet6_dev *idev = in6_dev_get(dev);
2356 struct net *net = dev_net(dev);
2357
2358 if (unlikely(!idev))
2359 return ERR_PTR(-ENODEV);
2360
2361 rt = ip6_dst_alloc(net, dev, 0);
2362 if (unlikely(!rt)) {
2363 in6_dev_put(idev);
2364 dst = ERR_PTR(-ENOMEM);
2365 goto out;
2366 }
2367
2368 rt->dst.flags |= DST_HOST;
2369 rt->dst.input = ip6_input;
2370 rt->dst.output = ip6_output;
2371 rt->rt6i_gateway = fl6->daddr;
2372 rt->rt6i_dst.addr = fl6->daddr;
2373 rt->rt6i_dst.plen = 128;
2374 rt->rt6i_idev = idev;
2375 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2376
2377 /* Add this dst into uncached_list so that rt6_disable_ip() can
2378	 * properly release the net_device
2379 */
2380 rt6_uncached_list_add(rt);
2381 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2382
2383 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2384
2385out:
2386 return dst;
2387}
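
/* Usage sketch (hedged): ndisc is the expected caller, building a
 * one-off dst for an outgoing ND packet without consulting the FIB:
 *
 *	dst = icmp6_dst_alloc(dev, &fl6);
 *	if (IS_ERR(dst))
 *		goto drop;
 *	skb_dst_set(skb, dst);
 *
 * the uncached-list insertion above keeps device teardown safe.
 */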
2388
2389static int ip6_dst_gc(struct dst_ops *ops)
2390{
2391 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2392 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2393 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2394 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2395 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2396 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2397 int entries;
2398
2399 entries = dst_entries_get_fast(ops);
2400 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2401 entries <= rt_max_size)
2402 goto out;
2403
2404 net->ipv6.ip6_rt_gc_expire++;
2405 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2406 entries = dst_entries_get_slow(ops);
2407 if (entries < ops->gc_thresh)
2408 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2409out:
2410 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2411 return entries > rt_max_size;
2412}
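
/* Worked example (hedged, assuming the default gc_elasticity of 9):
 * every forced GC pass bumps ip6_rt_gc_expire by one, while the final
 * statement decays it by expire >> 9, i.e. roughly 1/512 of itself, so
 * sustained pressure steadily lengthens the timeout handed to
 * fib6_run_gc().
 */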
2413
2414static int ip6_convert_metrics(struct mx6_config *mxc,
2415 const struct fib6_config *cfg)
2416{
2417 struct net *net = cfg->fc_nlinfo.nl_net;
2418 bool ecn_ca = false;
2419 struct nlattr *nla;
2420 int remaining;
2421 u32 *mp;
2422
2423 if (!cfg->fc_mx)
2424 return 0;
2425
2426 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2427 if (unlikely(!mp))
2428 return -ENOMEM;
2429
2430 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2431 int type = nla_type(nla);
2432 u32 val;
2433
2434 if (!type)
2435 continue;
2436 if (unlikely(type > RTAX_MAX))
2437 goto err;
2438
2439 if (type == RTAX_CC_ALGO) {
2440 char tmp[TCP_CA_NAME_MAX];
2441
2442 nla_strlcpy(tmp, nla, sizeof(tmp));
2443 val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2444 if (val == TCP_CA_UNSPEC)
2445 goto err;
2446 } else {
2447 val = nla_get_u32(nla);
2448 }
2449 if (type == RTAX_HOPLIMIT && val > 255)
2450 val = 255;
2451 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2452 goto err;
2453
2454 mp[type - 1] = val;
2455 __set_bit(type - 1, mxc->mx_valid);
2456 }
2457
2458 if (ecn_ca) {
2459 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2460 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2461 }
2462
2463 mxc->mx = mp;
2464 return 0;
2465 err:
2466 kfree(mp);
2467 return -EINVAL;
2468}
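
/* Example (hedged): "ip -6 route add 2001:db8::/64 dev eth0 mtu 1280"
 * is expected to arrive as a nested RTA_METRICS attribute carrying
 * RTAX_MTU = 1280, which the loop above stores as mp[RTAX_MTU - 1]
 * and marks in mxc->mx_valid.
 */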
2469
2470static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2471 struct fib6_config *cfg,
2472 const struct in6_addr *gw_addr,
2473 u32 tbid, int flags)
2474{
2475 struct flowi6 fl6 = {
2476 .flowi6_oif = cfg->fc_ifindex,
2477 .daddr = *gw_addr,
2478 .saddr = cfg->fc_prefsrc,
2479 };
2480 struct fib6_table *table;
2481 struct rt6_info *rt;
2482
2483 table = fib6_get_table(net, tbid);
2484 if (!table)
2485 return NULL;
2486
2487 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2488 flags |= RT6_LOOKUP_F_HAS_SADDR;
2489
2490 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2491 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2492
2493 /* if table lookup failed, fall back to full lookup */
2494 if (rt == net->ipv6.ip6_null_entry) {
2495 ip6_rt_put(rt);
2496 rt = NULL;
2497 }
2498
2499 return rt;
2500}
2501
2502static int ip6_route_check_nh_onlink(struct net *net,
2503 struct fib6_config *cfg,
2504 struct net_device *dev,
2505 struct netlink_ext_ack *extack)
2506{
2507 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2508 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2509 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2510 struct rt6_info *grt;
2511 int err;
2512
2513 err = 0;
2514 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2515 if (grt) {
2516 if (!grt->dst.error &&
2517 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2518 NL_SET_ERR_MSG(extack,
2519 "Nexthop has invalid gateway or device mismatch");
2520 err = -EINVAL;
2521 }
2522
2523 ip6_rt_put(grt);
2524 }
2525
2526 return err;
2527}
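
/* Example (hedged): this path backs constructs such as
 * "ip -6 route add 2001:db8::/64 via 2001:db8:1::1 dev eth0 onlink",
 * where the gateway need not be covered by a connected route; the
 * lookup above only rejects local/anycast/reject matches or a device
 * mismatch.
 */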
2528
2529static int ip6_route_check_nh(struct net *net,
2530 struct fib6_config *cfg,
2531 struct net_device **_dev,
2532 struct inet6_dev **idev)
2533{
2534 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2535 struct net_device *dev = _dev ? *_dev : NULL;
2536 struct rt6_info *grt = NULL;
2537 int err = -EHOSTUNREACH;
2538
2539 if (cfg->fc_table) {
2540 int flags = RT6_LOOKUP_F_IFACE;
2541
2542 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2543 cfg->fc_table, flags);
2544 if (grt) {
2545 if (grt->rt6i_flags & RTF_GATEWAY ||
2546 (dev && dev != grt->dst.dev)) {
2547 ip6_rt_put(grt);
2548 grt = NULL;
2549 }
2550 }
2551 }
2552
2553 if (!grt)
2554 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
2555
2556 if (!grt)
2557 goto out;
2558
2559 if (dev) {
2560 if (dev != grt->dst.dev) {
2561 ip6_rt_put(grt);
2562 goto out;
2563 }
2564 } else {
2565 *_dev = dev = grt->dst.dev;
2566 *idev = grt->rt6i_idev;
2567 dev_hold(dev);
2568 in6_dev_hold(grt->rt6i_idev);
2569 }
2570
2571 if (!(grt->rt6i_flags & RTF_GATEWAY))
2572 err = 0;
2573
2574 ip6_rt_put(grt);
2575
2576out:
2577 return err;
2578}
2579
2580static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2581 struct netlink_ext_ack *extack)
2582{
2583 struct net *net = cfg->fc_nlinfo.nl_net;
2584 struct rt6_info *rt = NULL;
2585 struct net_device *dev = NULL;
2586 struct inet6_dev *idev = NULL;
2587 struct fib6_table *table;
2588 int addr_type;
2589 int err = -EINVAL;
2590
2591	/* RTF_PCPU is an internal flag; it cannot be set by userspace */
2592 if (cfg->fc_flags & RTF_PCPU) {
2593 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2594 goto out;
2595 }
2596
2597	/* RTF_CACHE is an internal flag; it cannot be set by userspace */
2598 if (cfg->fc_flags & RTF_CACHE) {
2599 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2600 goto out;
2601 }
2602
2603 if (cfg->fc_dst_len > 128) {
2604 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2605 goto out;
2606 }
2607 if (cfg->fc_src_len > 128) {
2608 NL_SET_ERR_MSG(extack, "Invalid source address length");
2609 goto out;
2610 }
2611#ifndef CONFIG_IPV6_SUBTREES
2612 if (cfg->fc_src_len) {
2613 NL_SET_ERR_MSG(extack,
2614 "Specifying source address requires IPV6_SUBTREES to be enabled");
2615 goto out;
2616 }
2617#endif
2618 if (cfg->fc_ifindex) {
2619 err = -ENODEV;
2620 dev = dev_get_by_index(net, cfg->fc_ifindex);
2621 if (!dev)
2622 goto out;
2623 idev = in6_dev_get(dev);
2624 if (!idev)
2625 goto out;
2626 }
2627
2628 if (cfg->fc_metric == 0)
2629 cfg->fc_metric = IP6_RT_PRIO_USER;
2630
2631 if (cfg->fc_flags & RTNH_F_ONLINK) {
2632 if (!dev) {
2633 NL_SET_ERR_MSG(extack,
2634 "Nexthop device required for onlink");
2635 err = -ENODEV;
2636 goto out;
2637 }
2638
2639 if (!(dev->flags & IFF_UP)) {
2640 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2641 err = -ENETDOWN;
2642 goto out;
2643 }
2644 }
2645
2646 err = -ENOBUFS;
2647 if (cfg->fc_nlinfo.nlh &&
2648 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2649 table = fib6_get_table(net, cfg->fc_table);
2650 if (!table) {
2651 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2652 table = fib6_new_table(net, cfg->fc_table);
2653 }
2654 } else {
2655 table = fib6_new_table(net, cfg->fc_table);
2656 }
2657
2658 if (!table)
2659 goto out;
2660
2661 rt = ip6_dst_alloc(net, NULL,
2662 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2663
2664 if (!rt) {
2665 err = -ENOMEM;
2666 goto out;
2667 }
2668
2669 if (cfg->fc_flags & RTF_EXPIRES)
2670 rt6_set_expires(rt, jiffies +
2671 clock_t_to_jiffies(cfg->fc_expires));
2672 else
2673 rt6_clean_expires(rt);
2674
2675 if (cfg->fc_protocol == RTPROT_UNSPEC)
2676 cfg->fc_protocol = RTPROT_BOOT;
2677 rt->rt6i_protocol = cfg->fc_protocol;
2678
2679 addr_type = ipv6_addr_type(&cfg->fc_dst);
2680
2681 if (addr_type & IPV6_ADDR_MULTICAST)
2682 rt->dst.input = ip6_mc_input;
2683 else if (cfg->fc_flags & RTF_LOCAL)
2684 rt->dst.input = ip6_input;
2685 else
2686 rt->dst.input = ip6_forward;
2687
2688 rt->dst.output = ip6_output;
2689
2690 if (cfg->fc_encap) {
2691 struct lwtunnel_state *lwtstate;
2692
2693 err = lwtunnel_build_state(cfg->fc_encap_type,
2694 cfg->fc_encap, AF_INET6, cfg,
2695 &lwtstate, extack);
2696 if (err)
2697 goto out;
2698 rt->dst.lwtstate = lwtstate_get(lwtstate);
2699 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2700 rt->dst.lwtstate->orig_output = rt->dst.output;
2701 rt->dst.output = lwtunnel_output;
2702 }
2703 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2704 rt->dst.lwtstate->orig_input = rt->dst.input;
2705 rt->dst.input = lwtunnel_input;
2706 }
2707 }
2708
2709 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2710 rt->rt6i_dst.plen = cfg->fc_dst_len;
2711 if (rt->rt6i_dst.plen == 128)
2712 rt->dst.flags |= DST_HOST;
2713
2714#ifdef CONFIG_IPV6_SUBTREES
2715 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2716 rt->rt6i_src.plen = cfg->fc_src_len;
2717#endif
2718
2719 rt->rt6i_metric = cfg->fc_metric;
2720 rt->rt6i_nh_weight = 1;
2721
2722	/* We cannot add true routes via loopback here;
2723	   they would result in the kernel looping. Promote them to reject routes.
2724 */
2725 if ((cfg->fc_flags & RTF_REJECT) ||
2726 (dev && (dev->flags & IFF_LOOPBACK) &&
2727 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2728 !(cfg->fc_flags & RTF_LOCAL))) {
2729 /* hold loopback dev/idev if we haven't done so. */
2730 if (dev != net->loopback_dev) {
2731 if (dev) {
2732 dev_put(dev);
2733 in6_dev_put(idev);
2734 }
2735 dev = net->loopback_dev;
2736 dev_hold(dev);
2737 idev = in6_dev_get(dev);
2738 if (!idev) {
2739 err = -ENODEV;
2740 goto out;
2741 }
2742 }
2743 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2744 switch (cfg->fc_type) {
2745 case RTN_BLACKHOLE:
2746 rt->dst.error = -EINVAL;
2747 rt->dst.output = dst_discard_out;
2748 rt->dst.input = dst_discard;
2749 break;
2750 case RTN_PROHIBIT:
2751 rt->dst.error = -EACCES;
2752 rt->dst.output = ip6_pkt_prohibit_out;
2753 rt->dst.input = ip6_pkt_prohibit;
2754 break;
2755 case RTN_THROW:
2756 case RTN_UNREACHABLE:
2757 default:
2758 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2759 : (cfg->fc_type == RTN_UNREACHABLE)
2760 ? -EHOSTUNREACH : -ENETUNREACH;
2761 rt->dst.output = ip6_pkt_discard_out;
2762 rt->dst.input = ip6_pkt_discard;
2763 break;
2764 }
2765 goto install_route;
2766 }
2767
2768 if (cfg->fc_flags & RTF_GATEWAY) {
2769 const struct in6_addr *gw_addr;
2770 int gwa_type;
2771
2772 gw_addr = &cfg->fc_gateway;
2773 gwa_type = ipv6_addr_type(gw_addr);
2774
2775	/* If gw_addr is local we will fail to detect this while the
2776	 * address is still TENTATIVE (DAD in progress): rt6_lookup()
2777	 * will return the already-added prefix route via the interface
2778	 * the prefix route was assigned to, which might be non-loopback.
2779 */
2780 err = -EINVAL;
2781 if (ipv6_chk_addr_and_flags(net, gw_addr,
2782 gwa_type & IPV6_ADDR_LINKLOCAL ?
2783 dev : NULL, 0, 0)) {
2784 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2785 goto out;
2786 }
2787 rt->rt6i_gateway = *gw_addr;
2788
2789 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2790	/* IPv6 strictly forbids using non-link-local
2791	   addresses as the nexthop address.
2792	   Otherwise, routers will not be able to send redirects.
2793	   That is usually right, but in some (rare!) circumstances
2794	   (SIT, PtP, NBMA NOARP links) it is handy to allow
2795	   some exceptions. --ANK
2796	   We allow IPv4-mapped nexthops to support RFC4798-type
2797	   addressing.
2798 */
2799 if (!(gwa_type & (IPV6_ADDR_UNICAST |
2800 IPV6_ADDR_MAPPED))) {
2801 NL_SET_ERR_MSG(extack,
2802 "Invalid gateway address");
2803 goto out;
2804 }
2805
2806 if (cfg->fc_flags & RTNH_F_ONLINK) {
2807 err = ip6_route_check_nh_onlink(net, cfg, dev,
2808 extack);
2809 } else {
2810 err = ip6_route_check_nh(net, cfg, &dev, &idev);
2811 }
2812 if (err)
2813 goto out;
2814 }
2815 err = -EINVAL;
2816 if (!dev) {
2817 NL_SET_ERR_MSG(extack, "Egress device not specified");
2818 goto out;
2819 } else if (dev->flags & IFF_LOOPBACK) {
2820 NL_SET_ERR_MSG(extack,
2821 "Egress device can not be loopback device for this route");
2822 goto out;
2823 }
2824 }
2825
2826 err = -ENODEV;
2827 if (!dev)
2828 goto out;
2829
2830 if (!(dev->flags & IFF_UP)) {
2831 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2832 err = -ENETDOWN;
2833 goto out;
2834 }
2835
2836 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2837 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2838 NL_SET_ERR_MSG(extack, "Invalid source address");
2839 err = -EINVAL;
2840 goto out;
2841 }
2842 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2843 rt->rt6i_prefsrc.plen = 128;
2844 } else
2845 rt->rt6i_prefsrc.plen = 0;
2846
2847 rt->rt6i_flags = cfg->fc_flags;
2848
2849install_route:
2850 if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2851 !netif_carrier_ok(dev))
2852 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2853 rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2854 rt->dst.dev = dev;
2855 rt->rt6i_idev = idev;
2856 rt->rt6i_table = table;
2857
2858 cfg->fc_nlinfo.nl_net = dev_net(dev);
2859
2860 return rt;
2861out:
2862 if (dev)
2863 dev_put(dev);
2864 if (idev)
2865 in6_dev_put(idev);
2866 if (rt)
2867 dst_release_immediate(&rt->dst);
2868
2869 return ERR_PTR(err);
2870}
2871
2872int ip6_route_add(struct fib6_config *cfg,
2873 struct netlink_ext_ack *extack)
2874{
2875 struct mx6_config mxc = { .mx = NULL, };
2876 struct rt6_info *rt;
2877 int err;
2878
2879 rt = ip6_route_info_create(cfg, extack);
2880 if (IS_ERR(rt)) {
2881 err = PTR_ERR(rt);
2882 rt = NULL;
2883 goto out;
2884 }
2885
2886 err = ip6_convert_metrics(&mxc, cfg);
2887 if (err)
2888 goto out;
2889
2890 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2891
2892 kfree(mxc.mx);
2893
2894 return err;
2895out:
2896 if (rt)
2897 dst_release_immediate(&rt->dst);
2898
2899 return err;
2900}
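
/* Usage sketch (hedged): kernel-internal callers fill a fib6_config and
 * call ip6_route_add() directly, roughly as addrconf does for prefix
 * routes:
 *
 *	struct fib6_config cfg = {
 *		.fc_table = RT6_TABLE_PREFIX,
 *		.fc_metric = IP6_RT_PRIO_ADDRCONF,
 *		.fc_ifindex = dev->ifindex,
 *		.fc_dst_len = plen,
 *		.fc_flags = RTF_UP,
 *		.fc_nlinfo.nl_net = net,
 *	};
 *	ip6_route_add(&cfg, NULL);
 *
 * field values here are illustrative; the real addrconf flags differ.
 */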
2901
2902static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2903{
2904 int err;
2905 struct fib6_table *table;
2906 struct net *net = dev_net(rt->dst.dev);
2907
2908 if (rt == net->ipv6.ip6_null_entry) {
2909 err = -ENOENT;
2910 goto out;
2911 }
2912
2913 table = rt->rt6i_table;
2914 spin_lock_bh(&table->tb6_lock);
2915 err = fib6_del(rt, info);
2916 spin_unlock_bh(&table->tb6_lock);
2917
2918out:
2919 ip6_rt_put(rt);
2920 return err;
2921}
2922
2923int ip6_del_rt(struct rt6_info *rt)
2924{
2925 struct nl_info info = {
2926 .nl_net = dev_net(rt->dst.dev),
2927 };
2928 return __ip6_del_rt(rt, &info);
2929}
2930
2931static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2932{
2933 struct nl_info *info = &cfg->fc_nlinfo;
2934 struct net *net = info->nl_net;
2935 struct sk_buff *skb = NULL;
2936 struct fib6_table *table;
2937 int err = -ENOENT;
2938
2939 if (rt == net->ipv6.ip6_null_entry)
2940 goto out_put;
2941 table = rt->rt6i_table;
2942 spin_lock_bh(&table->tb6_lock);
2943
2944 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2945 struct rt6_info *sibling, *next_sibling;
2946
2947 /* prefer to send a single notification with all hops */
2948 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2949 if (skb) {
2950 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2951
2952 if (rt6_fill_node(net, skb, rt,
2953 NULL, NULL, 0, RTM_DELROUTE,
2954 info->portid, seq, 0) < 0) {
2955 kfree_skb(skb);
2956 skb = NULL;
2957 } else
2958 info->skip_notify = 1;
2959 }
2960
2961 list_for_each_entry_safe(sibling, next_sibling,
2962 &rt->rt6i_siblings,
2963 rt6i_siblings) {
2964 err = fib6_del(sibling, info);
2965 if (err)
2966 goto out_unlock;
2967 }
2968 }
2969
2970 err = fib6_del(rt, info);
2971out_unlock:
2972 spin_unlock_bh(&table->tb6_lock);
2973out_put:
2974 ip6_rt_put(rt);
2975
2976 if (skb) {
2977 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2978 info->nlh, gfp_any());
2979 }
2980 return err;
2981}
2982
2983static int ip6_route_del(struct fib6_config *cfg,
2984 struct netlink_ext_ack *extack)
2985{
2986 struct rt6_info *rt, *rt_cache;
2987 struct fib6_table *table;
2988 struct fib6_node *fn;
2989 int err = -ESRCH;
2990
2991 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2992 if (!table) {
2993 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2994 return err;
2995 }
2996
2997 rcu_read_lock();
2998
2999 fn = fib6_locate(&table->tb6_root,
3000 &cfg->fc_dst, cfg->fc_dst_len,
3001 &cfg->fc_src, cfg->fc_src_len,
3002 !(cfg->fc_flags & RTF_CACHE));
3003
3004 if (fn) {
3005 for_each_fib6_node_rt_rcu(fn) {
3006 if (cfg->fc_flags & RTF_CACHE) {
3007 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3008 &cfg->fc_src);
3009 if (!rt_cache)
3010 continue;
3011 rt = rt_cache;
3012 }
3013 if (cfg->fc_ifindex &&
3014 (!rt->dst.dev ||
3015 rt->dst.dev->ifindex != cfg->fc_ifindex))
3016 continue;
3017 if (cfg->fc_flags & RTF_GATEWAY &&
3018 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3019 continue;
3020 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
3021 continue;
3022 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3023 continue;
3024 if (!dst_hold_safe(&rt->dst))
3025 break;
3026 rcu_read_unlock();
3027
3028 /* if gateway was specified only delete the one hop */
3029 if (cfg->fc_flags & RTF_GATEWAY)
3030 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3031
3032 return __ip6_del_rt_siblings(rt, cfg);
3033 }
3034 }
3035 rcu_read_unlock();
3036
3037 return err;
3038}
3039
3040static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3041{
3042 struct netevent_redirect netevent;
3043 struct rt6_info *rt, *nrt = NULL;
3044 struct ndisc_options ndopts;
3045 struct inet6_dev *in6_dev;
3046 struct neighbour *neigh;
3047 struct rd_msg *msg;
3048 int optlen, on_link;
3049 u8 *lladdr;
3050
3051 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3052 optlen -= sizeof(*msg);
3053
3054 if (optlen < 0) {
3055 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3056 return;
3057 }
3058
3059 msg = (struct rd_msg *)icmp6_hdr(skb);
3060
3061 if (ipv6_addr_is_multicast(&msg->dest)) {
3062 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3063 return;
3064 }
3065
3066 on_link = 0;
3067 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3068 on_link = 1;
3069 } else if (ipv6_addr_type(&msg->target) !=
3070 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3071 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3072 return;
3073 }
3074
3075 in6_dev = __in6_dev_get(skb->dev);
3076 if (!in6_dev)
3077 return;
3078 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3079 return;
3080
3081 /* RFC2461 8.1:
3082 * The IP source address of the Redirect MUST be the same as the current
3083 * first-hop router for the specified ICMP Destination Address.
3084 */
3085
3086 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3087 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3088 return;
3089 }
3090
3091 lladdr = NULL;
3092 if (ndopts.nd_opts_tgt_lladdr) {
3093 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3094 skb->dev);
3095 if (!lladdr) {
3096 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3097 return;
3098 }
3099 }
3100
3101 rt = (struct rt6_info *) dst;
3102 if (rt->rt6i_flags & RTF_REJECT) {
3103 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3104 return;
3105 }
3106
3107 /* Redirect received -> path was valid.
3108	 * Redirects are sent only in response to data packets,
3109	 * so this nexthop is apparently reachable. --ANK
3110 */
3111 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3112
3113 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3114 if (!neigh)
3115 return;
3116
3117 /*
3118 * We have finally decided to accept it.
3119 */
3120
3121 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3122 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3123 NEIGH_UPDATE_F_OVERRIDE|
3124 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3125 NEIGH_UPDATE_F_ISROUTER)),
3126 NDISC_REDIRECT, &ndopts);
3127
3128 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3129 if (!nrt)
3130 goto out;
3131
3132 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3133 if (on_link)
3134 nrt->rt6i_flags &= ~RTF_GATEWAY;
3135
3136 nrt->rt6i_protocol = RTPROT_REDIRECT;
3137 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3138
3139 /* No need to remove rt from the exception table if rt is
3140 * a cached route because rt6_insert_exception() will
3141	 * take care of it.
3142 */
3143 if (rt6_insert_exception(nrt, rt)) {
3144 dst_release_immediate(&nrt->dst);
3145 goto out;
3146 }
3147
3148 netevent.old = &rt->dst;
3149 netevent.new = &nrt->dst;
3150 netevent.daddr = &msg->dest;
3151 netevent.neigh = neigh;
3152 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3153
3154out:
3155 neigh_release(neigh);
3156}
3157
3158/*
3159 * Misc support functions
3160 */
3161
3162static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3163{
3164 BUG_ON(from->from);
3165
3166 rt->rt6i_flags &= ~RTF_EXPIRES;
3167 dst_hold(&from->dst);
3168 rt->from = from;
3169 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3170}
3171
3172static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3173{
3174 rt->dst.input = ort->dst.input;
3175 rt->dst.output = ort->dst.output;
3176 rt->rt6i_dst = ort->rt6i_dst;
3177 rt->dst.error = ort->dst.error;
3178 rt->rt6i_idev = ort->rt6i_idev;
3179 if (rt->rt6i_idev)
3180 in6_dev_hold(rt->rt6i_idev);
3181 rt->dst.lastuse = jiffies;
3182 rt->rt6i_gateway = ort->rt6i_gateway;
3183 rt->rt6i_flags = ort->rt6i_flags;
3184 rt6_set_from(rt, ort);
3185 rt->rt6i_metric = ort->rt6i_metric;
3186#ifdef CONFIG_IPV6_SUBTREES
3187 rt->rt6i_src = ort->rt6i_src;
3188#endif
3189 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3190 rt->rt6i_table = ort->rt6i_table;
3191 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3192}
3193
3194#ifdef CONFIG_IPV6_ROUTE_INFO
3195static struct rt6_info *rt6_get_route_info(struct net *net,
3196 const struct in6_addr *prefix, int prefixlen,
3197 const struct in6_addr *gwaddr,
3198 struct net_device *dev)
3199{
3200 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3201 int ifindex = dev->ifindex;
3202 struct fib6_node *fn;
3203 struct rt6_info *rt = NULL;
3204 struct fib6_table *table;
3205
3206 table = fib6_get_table(net, tb_id);
3207 if (!table)
3208 return NULL;
3209
3210 rcu_read_lock();
3211 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3212 if (!fn)
3213 goto out;
3214
3215 for_each_fib6_node_rt_rcu(fn) {
3216 if (rt->dst.dev->ifindex != ifindex)
3217 continue;
3218 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3219 continue;
3220 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3221 continue;
3222 ip6_hold_safe(NULL, &rt, false);
3223 break;
3224 }
3225out:
3226 rcu_read_unlock();
3227 return rt;
3228}
3229
3230static struct rt6_info *rt6_add_route_info(struct net *net,
3231 const struct in6_addr *prefix, int prefixlen,
3232 const struct in6_addr *gwaddr,
3233 struct net_device *dev,
3234 unsigned int pref)
3235{
3236 struct fib6_config cfg = {
3237 .fc_metric = IP6_RT_PRIO_USER,
3238 .fc_ifindex = dev->ifindex,
3239 .fc_dst_len = prefixlen,
3240 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3241 RTF_UP | RTF_PREF(pref),
3242 .fc_protocol = RTPROT_RA,
3243 .fc_nlinfo.portid = 0,
3244 .fc_nlinfo.nlh = NULL,
3245 .fc_nlinfo.nl_net = net,
3246 };
3247
3248 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3249 cfg.fc_dst = *prefix;
3250 cfg.fc_gateway = *gwaddr;
3251
3252 /* We should treat it as a default route if prefix length is 0. */
3253 if (!prefixlen)
3254 cfg.fc_flags |= RTF_DEFAULT;
3255
3256 ip6_route_add(&cfg, NULL);
3257
3258 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3259}
3260#endif
3261
3262struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3263{
3264 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3265 struct rt6_info *rt;
3266 struct fib6_table *table;
3267
3268 table = fib6_get_table(dev_net(dev), tb_id);
3269 if (!table)
3270 return NULL;
3271
3272 rcu_read_lock();
3273 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3274 if (dev == rt->dst.dev &&
3275 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3276 ipv6_addr_equal(&rt->rt6i_gateway, addr))
3277 break;
3278 }
3279 if (rt)
3280 ip6_hold_safe(NULL, &rt, false);
3281 rcu_read_unlock();
3282 return rt;
3283}
3284
3285struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3286 struct net_device *dev,
3287 unsigned int pref)
3288{
3289 struct fib6_config cfg = {
3290 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3291 .fc_metric = IP6_RT_PRIO_USER,
3292 .fc_ifindex = dev->ifindex,
3293 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3294 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3295 .fc_protocol = RTPROT_RA,
3296 .fc_nlinfo.portid = 0,
3297 .fc_nlinfo.nlh = NULL,
3298 .fc_nlinfo.nl_net = dev_net(dev),
3299 };
3300
3301 cfg.fc_gateway = *gwaddr;
3302
3303 if (!ip6_route_add(&cfg, NULL)) {
3304 struct fib6_table *table;
3305
3306 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3307 if (table)
3308 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3309 }
3310
3311 return rt6_get_dflt_router(gwaddr, dev);
3312}
3313
3314static void __rt6_purge_dflt_routers(struct fib6_table *table)
3315{
3316 struct rt6_info *rt;
3317
3318restart:
3319 rcu_read_lock();
3320 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3321 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3322 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3323 if (dst_hold_safe(&rt->dst)) {
3324 rcu_read_unlock();
3325 ip6_del_rt(rt);
3326 } else {
3327 rcu_read_unlock();
3328 }
3329 goto restart;
3330 }
3331 }
3332 rcu_read_unlock();
3333
3334 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3335}
3336
3337void rt6_purge_dflt_routers(struct net *net)
3338{
3339 struct fib6_table *table;
3340 struct hlist_head *head;
3341 unsigned int h;
3342
3343 rcu_read_lock();
3344
3345 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3346 head = &net->ipv6.fib_table_hash[h];
3347 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3348 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3349 __rt6_purge_dflt_routers(table);
3350 }
3351 }
3352
3353 rcu_read_unlock();
3354}
3355
3356static void rtmsg_to_fib6_config(struct net *net,
3357 struct in6_rtmsg *rtmsg,
3358 struct fib6_config *cfg)
3359{
3360 memset(cfg, 0, sizeof(*cfg));
3361
3362 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3363 : RT6_TABLE_MAIN;
3364 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3365 cfg->fc_metric = rtmsg->rtmsg_metric;
3366 cfg->fc_expires = rtmsg->rtmsg_info;
3367 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3368 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3369 cfg->fc_flags = rtmsg->rtmsg_flags;
3370
3371 cfg->fc_nlinfo.nl_net = net;
3372
3373 cfg->fc_dst = rtmsg->rtmsg_dst;
3374 cfg->fc_src = rtmsg->rtmsg_src;
3375 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3376}
3377
3378int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3379{
3380 struct fib6_config cfg;
3381 struct in6_rtmsg rtmsg;
3382 int err;
3383
3384 switch (cmd) {
3385 case SIOCADDRT: /* Add a route */
3386 case SIOCDELRT: /* Delete a route */
3387 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3388 return -EPERM;
3389 err = copy_from_user(&rtmsg, arg,
3390 sizeof(struct in6_rtmsg));
3391 if (err)
3392 return -EFAULT;
3393
3394 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3395
3396 rtnl_lock();
3397 switch (cmd) {
3398 case SIOCADDRT:
3399 err = ip6_route_add(&cfg, NULL);
3400 break;
3401 case SIOCDELRT:
3402 err = ip6_route_del(&cfg, NULL);
3403 break;
3404 default:
3405 err = -EINVAL;
3406 }
3407 rtnl_unlock();
3408
3409 return err;
3410 }
3411
3412 return -EINVAL;
3413}
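
/* Usage sketch (hedged): this is the legacy route(8) entry point, e.g.
 *
 *	struct in6_rtmsg rtmsg = { .rtmsg_dst_len = 64, ... };
 *	ioctl(fd, SIOCADDRT, &rtmsg);
 *
 * on an AF_INET6 socket fd; rtnetlink is the modern replacement and
 * takes the inet6_rtm_newroute/delroute paths instead.
 */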
3414
3415/*
3416 * Drop the packet on the floor
3417 */
3418
3419static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3420{
3421 int type;
3422 struct dst_entry *dst = skb_dst(skb);
3423 switch (ipstats_mib_noroutes) {
3424 case IPSTATS_MIB_INNOROUTES:
3425 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3426 if (type == IPV6_ADDR_ANY) {
3427 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3428 IPSTATS_MIB_INADDRERRORS);
3429 break;
3430 }
3431 /* FALLTHROUGH */
3432 case IPSTATS_MIB_OUTNOROUTES:
3433 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3434 ipstats_mib_noroutes);
3435 break;
3436 }
3437 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3438 kfree_skb(skb);
3439 return 0;
3440}
3441
3442static int ip6_pkt_discard(struct sk_buff *skb)
3443{
3444 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3445}
3446
3447static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3448{
3449 skb->dev = skb_dst(skb)->dev;
3450 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3451}
3452
3453static int ip6_pkt_prohibit(struct sk_buff *skb)
3454{
3455 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3456}
3457
3458static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3459{
3460 skb->dev = skb_dst(skb)->dev;
3461 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3462}
3463
3464/*
3465 * Allocate a dst for local (unicast / anycast) address.
3466 */
3467
3468struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3469 const struct in6_addr *addr,
3470 bool anycast)
3471{
3472 u32 tb_id;
3473 struct net *net = dev_net(idev->dev);
3474 struct net_device *dev = idev->dev;
3475 struct rt6_info *rt;
3476
3477 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3478 if (!rt)
3479 return ERR_PTR(-ENOMEM);
3480
3481 in6_dev_hold(idev);
3482
3483 rt->dst.flags |= DST_HOST;
3484 rt->dst.input = ip6_input;
3485 rt->dst.output = ip6_output;
3486 rt->rt6i_idev = idev;
3487
3488 rt->rt6i_protocol = RTPROT_KERNEL;
3489 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3490 if (anycast)
3491 rt->rt6i_flags |= RTF_ANYCAST;
3492 else
3493 rt->rt6i_flags |= RTF_LOCAL;
3494
3495 rt->rt6i_gateway = *addr;
3496 rt->rt6i_dst.addr = *addr;
3497 rt->rt6i_dst.plen = 128;
3498 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3499 rt->rt6i_table = fib6_get_table(net, tb_id);
3500
3501 return rt;
3502}
3503
3504/* remove deleted ip from prefsrc entries */
3505struct arg_dev_net_ip {
3506 struct net_device *dev;
3507 struct net *net;
3508 struct in6_addr *addr;
3509};
3510
3511static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3512{
3513 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3514 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3515 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3516
3517 if (((void *)rt->dst.dev == dev || !dev) &&
3518 rt != net->ipv6.ip6_null_entry &&
3519 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3520 spin_lock_bh(&rt6_exception_lock);
3521 /* remove prefsrc entry */
3522 rt->rt6i_prefsrc.plen = 0;
3523 /* need to update cache as well */
3524 rt6_exceptions_remove_prefsrc(rt);
3525 spin_unlock_bh(&rt6_exception_lock);
3526 }
3527 return 0;
3528}
3529
3530void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3531{
3532 struct net *net = dev_net(ifp->idev->dev);
3533 struct arg_dev_net_ip adni = {
3534 .dev = ifp->idev->dev,
3535 .net = net,
3536 .addr = &ifp->addr,
3537 };
3538 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3539}
3540
3541#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3542
3543/* Remove routers and update dst entries when a gateway turns into a host. */
3544static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3545{
3546 struct in6_addr *gateway = (struct in6_addr *)arg;
3547
3548 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3549 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3550 return -1;
3551 }
3552
3553	/* Further clean up cached routes in the exception table.
3554	 * This is needed because a cached route may have a different
3555	 * gateway from its 'parent' in the case of an ip redirect.
3556 */
3557 rt6_exceptions_clean_tohost(rt, gateway);
3558
3559 return 0;
3560}
3561
3562void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3563{
3564 fib6_clean_all(net, fib6_clean_tohost, gateway);
3565}
3566
3567struct arg_netdev_event {
3568 const struct net_device *dev;
3569 union {
3570 unsigned int nh_flags;
3571 unsigned long event;
3572 };
3573};
3574
3575static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3576{
3577 struct rt6_info *iter;
3578 struct fib6_node *fn;
3579
3580 fn = rcu_dereference_protected(rt->rt6i_node,
3581 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3582 iter = rcu_dereference_protected(fn->leaf,
3583 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3584 while (iter) {
3585 if (iter->rt6i_metric == rt->rt6i_metric &&
3586 rt6_qualify_for_ecmp(iter))
3587 return iter;
3588 iter = rcu_dereference_protected(iter->rt6_next,
3589 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3590 }
3591
3592 return NULL;
3593}
3594
3595static bool rt6_is_dead(const struct rt6_info *rt)
3596{
3597 if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3598 (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3599 rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3600 return true;
3601
3602 return false;
3603}
3604
3605static int rt6_multipath_total_weight(const struct rt6_info *rt)
3606{
3607 struct rt6_info *iter;
3608 int total = 0;
3609
3610 if (!rt6_is_dead(rt))
3611 total += rt->rt6i_nh_weight;
3612
3613 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3614 if (!rt6_is_dead(iter))
3615 total += iter->rt6i_nh_weight;
3616 }
3617
3618 return total;
3619}
3620
3621static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3622{
3623 int upper_bound = -1;
3624
3625 if (!rt6_is_dead(rt)) {
3626 *weight += rt->rt6i_nh_weight;
3627 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3628 total) - 1;
3629 }
3630 atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3631}
3632
3633static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3634{
3635 struct rt6_info *iter;
3636 int weight = 0;
3637
3638 rt6_upper_bound_set(rt, &weight, total);
3639
3640 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3641 rt6_upper_bound_set(iter, &weight, total);
3642}
3643
3644void rt6_multipath_rebalance(struct rt6_info *rt)
3645{
3646 struct rt6_info *first;
3647 int total;
3648
3649	/* If the entire multipath route was marked for flushing,
3650	 * there is no need to rebalance upon the removal of every
3651 * sibling route.
3652 */
3653 if (!rt->rt6i_nsiblings || rt->should_flush)
3654 return;
3655
3656 /* During lookup routes are evaluated in order, so we need to
3657 * make sure upper bounds are assigned from the first sibling
3658 * onwards.
3659 */
3660 first = rt6_multipath_first_sibling(rt);
3661 if (WARN_ON_ONCE(!first))
3662 return;
3663
3664 total = rt6_multipath_total_weight(first);
3665 rt6_multipath_upper_bound_set(first, total);
3666}
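
/* Worked example (hedged): two live siblings with weights 1 and 3 give
 * total = 4, so rt6_upper_bound_set() assigns the first an upper bound
 * of DIV_ROUND_CLOSEST_ULL(1ULL << 31, 4) - 1 (~25% of the hash space)
 * and the second the remainder, yielding a 1:3 traffic split.
 */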
3667
3668static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3669{
3670 const struct arg_netdev_event *arg = p_arg;
3671 const struct net *net = dev_net(arg->dev);
3672
3673 if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3674 rt->rt6i_nh_flags &= ~arg->nh_flags;
3675 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3676 rt6_multipath_rebalance(rt);
3677 }
3678
3679 return 0;
3680}
3681
3682void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3683{
3684 struct arg_netdev_event arg = {
3685 .dev = dev,
3686 {
3687 .nh_flags = nh_flags,
3688 },
3689 };
3690
3691 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3692 arg.nh_flags |= RTNH_F_LINKDOWN;
3693
3694 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3695}
3696
3697static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3698 const struct net_device *dev)
3699{
3700 struct rt6_info *iter;
3701
3702 if (rt->dst.dev == dev)
3703 return true;
3704 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3705 if (iter->dst.dev == dev)
3706 return true;
3707
3708 return false;
3709}
3710
3711static void rt6_multipath_flush(struct rt6_info *rt)
3712{
3713 struct rt6_info *iter;
3714
3715 rt->should_flush = 1;
3716 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3717 iter->should_flush = 1;
3718}
3719
3720static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3721 const struct net_device *down_dev)
3722{
3723 struct rt6_info *iter;
3724 unsigned int dead = 0;
3725
3726 if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3727 dead++;
3728 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3729 if (iter->dst.dev == down_dev ||
3730 iter->rt6i_nh_flags & RTNH_F_DEAD)
3731 dead++;
3732
3733 return dead;
3734}
3735
3736static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3737 const struct net_device *dev,
3738 unsigned int nh_flags)
3739{
3740 struct rt6_info *iter;
3741
3742 if (rt->dst.dev == dev)
3743 rt->rt6i_nh_flags |= nh_flags;
3744 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3745 if (iter->dst.dev == dev)
3746 iter->rt6i_nh_flags |= nh_flags;
3747}
3748
3749/* called with write lock held for table with rt */
3750static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3751{
3752 const struct arg_netdev_event *arg = p_arg;
3753 const struct net_device *dev = arg->dev;
3754 const struct net *net = dev_net(dev);
3755
3756 if (rt == net->ipv6.ip6_null_entry)
3757 return 0;
3758
3759 switch (arg->event) {
3760 case NETDEV_UNREGISTER:
3761 return rt->dst.dev == dev ? -1 : 0;
3762 case NETDEV_DOWN:
3763 if (rt->should_flush)
3764 return -1;
3765 if (!rt->rt6i_nsiblings)
3766 return rt->dst.dev == dev ? -1 : 0;
3767 if (rt6_multipath_uses_dev(rt, dev)) {
3768 unsigned int count;
3769
3770 count = rt6_multipath_dead_count(rt, dev);
3771 if (rt->rt6i_nsiblings + 1 == count) {
3772 rt6_multipath_flush(rt);
3773 return -1;
3774 }
3775 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3776 RTNH_F_LINKDOWN);
3777 fib6_update_sernum(rt);
3778 rt6_multipath_rebalance(rt);
3779 }
3780 return -2;
3781 case NETDEV_CHANGE:
3782 if (rt->dst.dev != dev ||
3783 rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3784 break;
3785 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3786 rt6_multipath_rebalance(rt);
3787 break;
3788 }
3789
3790 return 0;
3791}
3792
3793void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3794{
3795 struct arg_netdev_event arg = {
3796 .dev = dev,
3797 {
3798 .event = event,
3799 },
3800 };
3801
3802 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3803}
3804
3805void rt6_disable_ip(struct net_device *dev, unsigned long event)
3806{
3807 rt6_sync_down_dev(dev, event);
3808 rt6_uncached_list_flush_dev(dev_net(dev), dev);
3809 neigh_ifdown(&nd_tbl, dev);
3810}
3811
3812struct rt6_mtu_change_arg {
3813 struct net_device *dev;
3814 unsigned int mtu;
3815};
3816
3817static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3818{
3819 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3820 struct inet6_dev *idev;
3821
3822	/* In IPv6, PMTU discovery is not optional,
3823	   so a locked RTAX_MTU cannot disable it.
3824 We still use this lock to block changes
3825 caused by addrconf/ndisc.
3826 */
3827
3828 idev = __in6_dev_get(arg->dev);
3829 if (!idev)
3830 return 0;
3831
3832	/* For an administrative MTU increase there is no way to discover
3833	   the corresponding IPv6 PMTU increase, so the PMTU must be raised
3834	   here. Since RFC 1981 doesn't cover administrative MTU increases
3835	   (e.g. jumbo frames), updating the PMTU here is a MUST.
3836 */
3837 if (rt->dst.dev == arg->dev &&
3838 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3839 spin_lock_bh(&rt6_exception_lock);
3840 if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
3841 rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
3842 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3843 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
3844 spin_unlock_bh(&rt6_exception_lock);
3845 }
3846 return 0;
3847}
3848
3849void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3850{
3851 struct rt6_mtu_change_arg arg = {
3852 .dev = dev,
3853 .mtu = mtu,
3854 };
3855
3856 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3857}
3858
3859static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3860 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
3861 [RTA_OIF] = { .type = NLA_U32 },
3862 [RTA_IIF] = { .type = NLA_U32 },
3863 [RTA_PRIORITY] = { .type = NLA_U32 },
3864 [RTA_METRICS] = { .type = NLA_NESTED },
3865 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
3866 [RTA_PREF] = { .type = NLA_U8 },
3867 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
3868 [RTA_ENCAP] = { .type = NLA_NESTED },
3869 [RTA_EXPIRES] = { .type = NLA_U32 },
3870 [RTA_UID] = { .type = NLA_U32 },
3871 [RTA_MARK] = { .type = NLA_U32 },
3872};
3873
3874static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3875 struct fib6_config *cfg,
3876 struct netlink_ext_ack *extack)
3877{
3878 struct rtmsg *rtm;
3879 struct nlattr *tb[RTA_MAX+1];
3880 unsigned int pref;
3881 int err;
3882
3883 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3884 NULL);
3885 if (err < 0)
3886 goto errout;
3887
3888 err = -EINVAL;
3889 rtm = nlmsg_data(nlh);
3890 memset(cfg, 0, sizeof(*cfg));
3891
3892 cfg->fc_table = rtm->rtm_table;
3893 cfg->fc_dst_len = rtm->rtm_dst_len;
3894 cfg->fc_src_len = rtm->rtm_src_len;
3895 cfg->fc_flags = RTF_UP;
3896 cfg->fc_protocol = rtm->rtm_protocol;
3897 cfg->fc_type = rtm->rtm_type;
3898
3899 if (rtm->rtm_type == RTN_UNREACHABLE ||
3900 rtm->rtm_type == RTN_BLACKHOLE ||
3901 rtm->rtm_type == RTN_PROHIBIT ||
3902 rtm->rtm_type == RTN_THROW)
3903 cfg->fc_flags |= RTF_REJECT;
3904
3905 if (rtm->rtm_type == RTN_LOCAL)
3906 cfg->fc_flags |= RTF_LOCAL;
3907
3908 if (rtm->rtm_flags & RTM_F_CLONED)
3909 cfg->fc_flags |= RTF_CACHE;
3910
3911 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
3912
3913 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3914 cfg->fc_nlinfo.nlh = nlh;
3915 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3916
3917 if (tb[RTA_GATEWAY]) {
3918 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3919 cfg->fc_flags |= RTF_GATEWAY;
3920 }
3921
3922 if (tb[RTA_DST]) {
3923 int plen = (rtm->rtm_dst_len + 7) >> 3;
3924
3925 if (nla_len(tb[RTA_DST]) < plen)
3926 goto errout;
3927
3928 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3929 }
3930
3931 if (tb[RTA_SRC]) {
3932 int plen = (rtm->rtm_src_len + 7) >> 3;
3933
3934 if (nla_len(tb[RTA_SRC]) < plen)
3935 goto errout;
3936
3937 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3938 }
3939
3940 if (tb[RTA_PREFSRC])
3941 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3942
3943 if (tb[RTA_OIF])
3944 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3945
3946 if (tb[RTA_PRIORITY])
3947 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3948
3949 if (tb[RTA_METRICS]) {
3950 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3951 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3952 }
3953
3954 if (tb[RTA_TABLE])
3955 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3956
3957 if (tb[RTA_MULTIPATH]) {
3958 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3959 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3960
3961 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3962 cfg->fc_mp_len, extack);
3963 if (err < 0)
3964 goto errout;
3965 }
3966
3967 if (tb[RTA_PREF]) {
3968 pref = nla_get_u8(tb[RTA_PREF]);
3969 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3970 pref != ICMPV6_ROUTER_PREF_HIGH)
3971 pref = ICMPV6_ROUTER_PREF_MEDIUM;
3972 cfg->fc_flags |= RTF_PREF(pref);
3973 }
3974
3975 if (tb[RTA_ENCAP])
3976 cfg->fc_encap = tb[RTA_ENCAP];
3977
3978 if (tb[RTA_ENCAP_TYPE]) {
3979 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3980
3981 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3982 if (err < 0)
3983 goto errout;
3984 }
3985
3986 if (tb[RTA_EXPIRES]) {
3987 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3988
3989 if (addrconf_finite_timeout(timeout)) {
3990 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3991 cfg->fc_flags |= RTF_EXPIRES;
3992 }
3993 }
3994
3995 err = 0;
3996errout:
3997 return err;
3998}
3999
4000struct rt6_nh {
4001 struct rt6_info *rt6_info;
4002 struct fib6_config r_cfg;
4003 struct mx6_config mxc;
4004 struct list_head next;
4005};
4006
4007static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4008{
4009 struct rt6_nh *nh;
4010
4011 list_for_each_entry(nh, rt6_nh_list, next) {
4012 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4013 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4014 nh->r_cfg.fc_ifindex);
4015 }
4016}
4017
4018static int ip6_route_info_append(struct list_head *rt6_nh_list,
4019 struct rt6_info *rt, struct fib6_config *r_cfg)
4020{
4021 struct rt6_nh *nh;
4022 int err = -EEXIST;
4023
4024 list_for_each_entry(nh, rt6_nh_list, next) {
4025 /* check if rt6_info already exists */
4026 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4027 return err;
4028 }
4029
4030 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4031 if (!nh)
4032 return -ENOMEM;
4033 nh->rt6_info = rt;
4034 err = ip6_convert_metrics(&nh->mxc, r_cfg);
4035 if (err) {
4036 kfree(nh);
4037 return err;
4038 }
4039 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4040 list_add_tail(&nh->next, rt6_nh_list);
4041
4042 return 0;
4043}
4044
4045static void ip6_route_mpath_notify(struct rt6_info *rt,
4046 struct rt6_info *rt_last,
4047 struct nl_info *info,
4048 __u16 nlflags)
4049{
4050 /* if this is an APPEND route, then rt points to the first route
4051	 * inserted and rt_last points to the last route inserted. Userspace
4052 * wants a consistent dump of the route which starts at the first
4053 * nexthop. Since sibling routes are always added at the end of
4054 * the list, find the first sibling of the last route appended
4055 */
4056 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4057 rt = list_first_entry(&rt_last->rt6i_siblings,
4058 struct rt6_info,
4059 rt6i_siblings);
4060 }
4061
4062 if (rt)
4063 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4064}
4065
4066static int ip6_route_multipath_add(struct fib6_config *cfg,
4067 struct netlink_ext_ack *extack)
4068{
4069 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4070 struct nl_info *info = &cfg->fc_nlinfo;
4071 struct fib6_config r_cfg;
4072 struct rtnexthop *rtnh;
4073 struct rt6_info *rt;
4074 struct rt6_nh *err_nh;
4075 struct rt6_nh *nh, *nh_safe;
4076 __u16 nlflags;
4077 int remaining;
4078 int attrlen;
4079 int err = 1;
4080 int nhn = 0;
4081 int replace = (cfg->fc_nlinfo.nlh &&
4082 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4083 LIST_HEAD(rt6_nh_list);
4084
4085 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4086 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4087 nlflags |= NLM_F_APPEND;
4088
4089 remaining = cfg->fc_mp_len;
4090 rtnh = (struct rtnexthop *)cfg->fc_mp;
4091
4092 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4093 * rt6_info structs per nexthop
4094 */
4095 while (rtnh_ok(rtnh, remaining)) {
4096 memcpy(&r_cfg, cfg, sizeof(*cfg));
4097 if (rtnh->rtnh_ifindex)
4098 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4099
4100 attrlen = rtnh_attrlen(rtnh);
4101 if (attrlen > 0) {
4102 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4103
4104 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4105 if (nla) {
4106 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4107 r_cfg.fc_flags |= RTF_GATEWAY;
4108 }
4109 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4110 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4111 if (nla)
4112 r_cfg.fc_encap_type = nla_get_u16(nla);
4113 }
4114
4115 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4116 rt = ip6_route_info_create(&r_cfg, extack);
4117 if (IS_ERR(rt)) {
4118 err = PTR_ERR(rt);
4119 rt = NULL;
4120 goto cleanup;
4121 }
4122
4123 rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4124
4125 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4126 if (err) {
4127 dst_release_immediate(&rt->dst);
4128 goto cleanup;
4129 }
4130
4131 rtnh = rtnh_next(rtnh, &remaining);
4132 }
4133
4134 /* for add and replace send one notification with all nexthops.
4135 * Skip the notification in fib6_add_rt2node and send one with
4136 * the full route when done
4137 */
4138 info->skip_notify = 1;
4139
4140 err_nh = NULL;
4141 list_for_each_entry(nh, &rt6_nh_list, next) {
4142 rt_last = nh->rt6_info;
4143 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4144 /* save reference to first route for notification */
4145 if (!rt_notif && !err)
4146 rt_notif = nh->rt6_info;
4147
4148 /* nh->rt6_info is used or freed at this point, reset to NULL */
4149 nh->rt6_info = NULL;
4150 if (err) {
4151 if (replace && nhn)
4152 ip6_print_replace_route_err(&rt6_nh_list);
4153 err_nh = nh;
4154 goto add_errout;
4155 }
4156
4157 /* Because each route is added like a single route, we remove
4158 * these flags after the first nexthop. If there is a collision,
4159 * we have already failed to add the first nexthop:
4160 * fib6_add_rt2node() has rejected it. When replacing, the old
4161 * nexthops have been replaced by the first new one; the rest
4162 * should be appended to it.
4163 */
4164 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4165 NLM_F_REPLACE);
4166 nhn++;
4167 }
4168
4169 /* success ... tell user about new route */
4170 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4171 goto cleanup;
4172
4173add_errout:
4174 /* send notification for routes that were added so that
4175 * the delete notifications sent by ip6_route_del are
4176 * coherent
4177 */
4178 if (rt_notif)
4179 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4180
4181 /* Delete routes that were already added */
4182 list_for_each_entry(nh, &rt6_nh_list, next) {
4183 if (err_nh == nh)
4184 break;
4185 ip6_route_del(&nh->r_cfg, extack);
4186 }
4187
4188cleanup:
4189 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4190 if (nh->rt6_info)
4191 dst_release_immediate(&nh->rt6_info->dst);
4192 kfree(nh->mxc.mx);
4193 list_del(&nh->next);
4194 kfree(nh);
4195 }
4196
4197 return err;
4198}
4199
4200static int ip6_route_multipath_del(struct fib6_config *cfg,
4201 struct netlink_ext_ack *extack)
4202{
4203 struct fib6_config r_cfg;
4204 struct rtnexthop *rtnh;
4205 int remaining;
4206 int attrlen;
4207 int err = 1, last_err = 0;
4208
4209 remaining = cfg->fc_mp_len;
4210 rtnh = (struct rtnexthop *)cfg->fc_mp;
4211
4212 /* Parse a Multipath Entry */
4213 while (rtnh_ok(rtnh, remaining)) {
4214 memcpy(&r_cfg, cfg, sizeof(*cfg));
4215 if (rtnh->rtnh_ifindex)
4216 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4217
4218 attrlen = rtnh_attrlen(rtnh);
4219 if (attrlen > 0) {
4220 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4221
4222 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4223 if (nla) {
4224 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4225 r_cfg.fc_flags |= RTF_GATEWAY;
4226 }
4227 }
4228 err = ip6_route_del(&r_cfg, extack);
4229 if (err)
4230 last_err = err;
4231
4232 rtnh = rtnh_next(rtnh, &remaining);
4233 }
4234
4235 return last_err;
4236}
4237
4238static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4239 struct netlink_ext_ack *extack)
4240{
4241 struct fib6_config cfg;
4242 int err;
4243
4244 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4245 if (err < 0)
4246 return err;
4247
4248 if (cfg.fc_mp)
4249 return ip6_route_multipath_del(&cfg, extack);
4250
4251 cfg.fc_delete_all_nh = 1;
4252 return ip6_route_del(&cfg, extack);
4254}
4255
4256static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4257 struct netlink_ext_ack *extack)
4258{
4259 struct fib6_config cfg;
4260 int err;
4261
4262 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4263 if (err < 0)
4264 return err;
4265
4266 if (cfg.fc_mp)
4267 return ip6_route_multipath_add(&cfg, extack);
4268
4269 return ip6_route_add(&cfg, extack);
4270}
4271
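/* Size (upper bound) of an RTM_NEWROUTE message for this route; used
 * by inet6_rt_notify() below to size the notification skb, so an
 * -EMSGSIZE from rt6_fill_node() indicates a bug in this estimate.
 */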
4272static size_t rt6_nlmsg_size(struct rt6_info *rt)
4273{
4274 int nexthop_len = 0;
4275
4276 if (rt->rt6i_nsiblings) {
4277 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4278 + NLA_ALIGN(sizeof(struct rtnexthop))
4279 + nla_total_size(16) /* RTA_GATEWAY */
4280 + lwtunnel_get_encap_size(rt->dst.lwtstate);
4281
4282 nexthop_len *= rt->rt6i_nsiblings;
4283 }
4284
4285 return NLMSG_ALIGN(sizeof(struct rtmsg))
4286 + nla_total_size(16) /* RTA_SRC */
4287 + nla_total_size(16) /* RTA_DST */
4288 + nla_total_size(16) /* RTA_GATEWAY */
4289 + nla_total_size(16) /* RTA_PREFSRC */
4290 + nla_total_size(4) /* RTA_TABLE */
4291 + nla_total_size(4) /* RTA_IIF */
4292 + nla_total_size(4) /* RTA_OIF */
4293 + nla_total_size(4) /* RTA_PRIORITY */
4294 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4295 + nla_total_size(sizeof(struct rta_cacheinfo))
4296 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4297 + nla_total_size(1) /* RTA_PREF */
4298 + lwtunnel_get_encap_size(rt->dst.lwtstate)
4299 + nexthop_len;
4300}
4301
4302static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4303 unsigned int *flags, bool skip_oif)
4304{
4305 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4306 *flags |= RTNH_F_DEAD;
4307
4308 if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4309 *flags |= RTNH_F_LINKDOWN;
4310 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4311 *flags |= RTNH_F_DEAD;
4312 }
4313
4314 if (rt->rt6i_flags & RTF_GATEWAY) {
4315 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4316 goto nla_put_failure;
4317 }
4318
4319 *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4320 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4321 *flags |= RTNH_F_OFFLOAD;
4322
4323 /* not needed for multipath encoding because it has an rtnexthop struct */
4324 if (!skip_oif && rt->dst.dev &&
4325 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4326 goto nla_put_failure;
4327
4328 if (rt->dst.lwtstate &&
4329 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4330 goto nla_put_failure;
4331
4332 return 0;
4333
4334nla_put_failure:
4335 return -EMSGSIZE;
4336}
4337
4338/* add multipath next hop */
4339static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4340{
4341 struct rtnexthop *rtnh;
4342 unsigned int flags = 0;
4343
4344 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4345 if (!rtnh)
4346 goto nla_put_failure;
4347
4348 rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4349 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4350
4351 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4352 goto nla_put_failure;
4353
4354 rtnh->rtnh_flags = flags;
4355
4356 /* length of rtnetlink header + attributes */
4357 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4358
4359 return 0;
4360
4361nla_put_failure:
4362 return -EMSGSIZE;
4363}
4364
4365static int rt6_fill_node(struct net *net,
4366 struct sk_buff *skb, struct rt6_info *rt,
4367 struct in6_addr *dst, struct in6_addr *src,
4368 int iif, int type, u32 portid, u32 seq,
4369 unsigned int flags)
4370{
4371 u32 metrics[RTAX_MAX];
4372 struct rtmsg *rtm;
4373 struct nlmsghdr *nlh;
4374 long expires;
4375 u32 table;
4376
4377 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4378 if (!nlh)
4379 return -EMSGSIZE;
4380
4381 rtm = nlmsg_data(nlh);
4382 rtm->rtm_family = AF_INET6;
4383 rtm->rtm_dst_len = rt->rt6i_dst.plen;
4384 rtm->rtm_src_len = rt->rt6i_src.plen;
4385 rtm->rtm_tos = 0;
4386 if (rt->rt6i_table)
4387 table = rt->rt6i_table->tb6_id;
4388 else
4389 table = RT6_TABLE_UNSPEC;
4390 rtm->rtm_table = table;
4391 if (nla_put_u32(skb, RTA_TABLE, table))
4392 goto nla_put_failure;
4393 if (rt->rt6i_flags & RTF_REJECT) {
4394 switch (rt->dst.error) {
4395 case -EINVAL:
4396 rtm->rtm_type = RTN_BLACKHOLE;
4397 break;
4398 case -EACCES:
4399 rtm->rtm_type = RTN_PROHIBIT;
4400 break;
4401 case -EAGAIN:
4402 rtm->rtm_type = RTN_THROW;
4403 break;
4404 default:
4405 rtm->rtm_type = RTN_UNREACHABLE;
4406 break;
4407 }
4408 } else if (rt->rt6i_flags & RTF_LOCAL)
4410 rtm->rtm_type = RTN_LOCAL;
4411 else if (rt->rt6i_flags & RTF_ANYCAST)
4412 rtm->rtm_type = RTN_ANYCAST;
4413 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4414 rtm->rtm_type = RTN_LOCAL;
4415 else
4416 rtm->rtm_type = RTN_UNICAST;
4417 rtm->rtm_flags = 0;
4418 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4419 rtm->rtm_protocol = rt->rt6i_protocol;
4420
4421 if (rt->rt6i_flags & RTF_CACHE)
4422 rtm->rtm_flags |= RTM_F_CLONED;
4423
4424 if (dst) {
4425 if (nla_put_in6_addr(skb, RTA_DST, dst))
4426 goto nla_put_failure;
4427 rtm->rtm_dst_len = 128;
4428 } else if (rtm->rtm_dst_len &&
4429 nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4430 goto nla_put_failure;
4431#ifdef CONFIG_IPV6_SUBTREES
4432 if (src) {
4433 if (nla_put_in6_addr(skb, RTA_SRC, src))
4434 goto nla_put_failure;
4435 rtm->rtm_src_len = 128;
4436 } else if (rtm->rtm_src_len &&
4437 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4438 goto nla_put_failure;
4439#endif
4440 if (iif) {
4441#ifdef CONFIG_IPV6_MROUTE
4442 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4443 int err = ip6mr_get_route(net, skb, rtm, portid);
4444
4445 if (err == 0)
4446 return 0;
4447 if (err < 0)
4448 goto nla_put_failure;
4449 } else
4450#endif
4451 if (nla_put_u32(skb, RTA_IIF, iif))
4452 goto nla_put_failure;
4453 } else if (dst) {
4454 struct in6_addr saddr_buf;
4455 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4456 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4457 goto nla_put_failure;
4458 }
4459
4460 if (rt->rt6i_prefsrc.plen) {
4461 struct in6_addr saddr_buf;
4462 saddr_buf = rt->rt6i_prefsrc.addr;
4463 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4464 goto nla_put_failure;
4465 }
4466
4467 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4468 if (rt->rt6i_pmtu)
4469 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4470 if (rtnetlink_put_metrics(skb, metrics) < 0)
4471 goto nla_put_failure;
4472
4473 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4474 goto nla_put_failure;
4475
4476 /* For multipath routes, walk the siblings list and add
4477 * each as a nexthop within RTA_MULTIPATH.
4478 */
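	/* Layout sketch of the result: RTA_MULTIPATH nests one struct
	 * rtnexthop per path (ifindex, flags, rtnh_hops = weight - 1),
	 * each followed by its own RTA_GATEWAY/encap attributes; rt
	 * itself is emitted first, then its siblings in list order.
	 */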
4479 if (rt->rt6i_nsiblings) {
4480 struct rt6_info *sibling, *next_sibling;
4481 struct nlattr *mp;
4482
4483 mp = nla_nest_start(skb, RTA_MULTIPATH);
4484 if (!mp)
4485 goto nla_put_failure;
4486
4487 if (rt6_add_nexthop(skb, rt) < 0)
4488 goto nla_put_failure;
4489
4490 list_for_each_entry_safe(sibling, next_sibling,
4491 &rt->rt6i_siblings, rt6i_siblings) {
4492 if (rt6_add_nexthop(skb, sibling) < 0)
4493 goto nla_put_failure;
4494 }
4495
4496 nla_nest_end(skb, mp);
4497 } else {
4498 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4499 goto nla_put_failure;
4500 }
4501
4502 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4503
4504 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4505 goto nla_put_failure;
4506
4507 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4508 goto nla_put_failure;
4509
4511 nlmsg_end(skb, nlh);
4512 return 0;
4513
4514nla_put_failure:
4515 nlmsg_cancel(skb, nlh);
4516 return -EMSGSIZE;
4517}
4518
4519int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4520{
4521 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4522 struct net *net = arg->net;
4523
4524 if (rt == net->ipv6.ip6_null_entry)
4525 return 0;
4526
4527 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4528 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4529
4530 /* user wants prefix routes only */
4531 if (rtm->rtm_flags & RTM_F_PREFIX &&
4532 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4533 /* success since this is not a prefix route */
4534 return 1;
4535 }
4536 }
4537
4538 return rt6_fill_node(net,
4539 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4540 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4541 NLM_F_MULTI);
4542}
4543
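/* RTM_GETROUTE handler: resolves a one-off route query (e.g. what
 * "ip -6 route get ADDR [from SRC] [iif DEV] [oif DEV] [fibmatch]"
 * sends; iproute2 syntax shown for illustration only) through an
 * input or output lookup and replies with a single RTM_NEWROUTE.
 */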
4544static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4545 struct netlink_ext_ack *extack)
4546{
4547 struct net *net = sock_net(in_skb->sk);
4548 struct nlattr *tb[RTA_MAX+1];
4549 int err, iif = 0, oif = 0;
4550 struct dst_entry *dst;
4551 struct rt6_info *rt;
4552 struct sk_buff *skb;
4553 struct rtmsg *rtm;
4554 struct flowi6 fl6;
4555 bool fibmatch;
4556
4557 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4558 extack);
4559 if (err < 0)
4560 goto errout;
4561
4562 err = -EINVAL;
4563 memset(&fl6, 0, sizeof(fl6));
4564 rtm = nlmsg_data(nlh);
4565 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4566 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4567
4568 if (tb[RTA_SRC]) {
4569 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4570 goto errout;
4571
4572 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4573 }
4574
4575 if (tb[RTA_DST]) {
4576 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4577 goto errout;
4578
4579 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4580 }
4581
4582 if (tb[RTA_IIF])
4583 iif = nla_get_u32(tb[RTA_IIF]);
4584
4585 if (tb[RTA_OIF])
4586 oif = nla_get_u32(tb[RTA_OIF]);
4587
4588 if (tb[RTA_MARK])
4589 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4590
4591 if (tb[RTA_UID])
4592 fl6.flowi6_uid = make_kuid(current_user_ns(),
4593 nla_get_u32(tb[RTA_UID]));
4594 else
4595 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4596
4597 if (iif) {
4598 struct net_device *dev;
4599 int flags = 0;
4600
4601 rcu_read_lock();
4602
4603 dev = dev_get_by_index_rcu(net, iif);
4604 if (!dev) {
4605 rcu_read_unlock();
4606 err = -ENODEV;
4607 goto errout;
4608 }
4609
4610 fl6.flowi6_iif = iif;
4611
4612 if (!ipv6_addr_any(&fl6.saddr))
4613 flags |= RT6_LOOKUP_F_HAS_SADDR;
4614
4615 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4616
4617 rcu_read_unlock();
4618 } else {
4619 fl6.flowi6_oif = oif;
4620
4621 dst = ip6_route_output(net, NULL, &fl6);
4622 }
4623
4625 rt = container_of(dst, struct rt6_info, dst);
4626 if (rt->dst.error) {
4627 err = rt->dst.error;
4628 ip6_rt_put(rt);
4629 goto errout;
4630 }
4631
4632 if (rt == net->ipv6.ip6_null_entry) {
4633 err = rt->dst.error;
4634 ip6_rt_put(rt);
4635 goto errout;
4636 }
4637
4638 if (fibmatch && rt->from) {
4639 struct rt6_info *ort = rt->from;
4640
4641 dst_hold(&ort->dst);
4642 ip6_rt_put(rt);
4643 rt = ort;
4644 }
4645
4646 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4647 if (!skb) {
4648 ip6_rt_put(rt);
4649 err = -ENOBUFS;
4650 goto errout;
4651 }
4652
4653 skb_dst_set(skb, &rt->dst);
4654 if (fibmatch)
4655 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4656 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4657 nlh->nlmsg_seq, 0);
4658 else
4659 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4660 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4661 nlh->nlmsg_seq, 0);
4662 if (err < 0) {
4663 kfree_skb(skb);
4664 goto errout;
4665 }
4666
4667 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4668errout:
4669 return err;
4670}
4671
4672void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4673 unsigned int nlm_flags)
4674{
4675 struct sk_buff *skb;
4676 struct net *net = info->nl_net;
4677 u32 seq;
4678 int err;
4679
4680 err = -ENOBUFS;
4681 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4682
4683 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4684 if (!skb)
4685 goto errout;
4686
4687 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4688 event, info->portid, seq, nlm_flags);
4689 if (err < 0) {
4690 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4691 WARN_ON(err == -EMSGSIZE);
4692 kfree_skb(skb);
4693 goto errout;
4694 }
4695 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4696 info->nlh, gfp_any());
4697 return;
4698errout:
4699 if (err < 0)
4700 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4701}
4702
4703static int ip6_route_dev_notify(struct notifier_block *this,
4704 unsigned long event, void *ptr)
4705{
4706 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4707 struct net *net = dev_net(dev);
4708
4709 if (!(dev->flags & IFF_LOOPBACK))
4710 return NOTIFY_OK;
4711
4712 if (event == NETDEV_REGISTER) {
4713 net->ipv6.ip6_null_entry->dst.dev = dev;
4714 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4715#ifdef CONFIG_IPV6_MULTIPLE_TABLES
4716 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4717 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4718 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4719 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4720#endif
4721 } else if (event == NETDEV_UNREGISTER &&
4722 dev->reg_state != NETREG_UNREGISTERED) {
4723 /* NETDEV_UNREGISTER can be fired multiple times by
4724 * netdev_wait_allrefs(). Make sure we only do this once.
4725 */
4726 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4727#ifdef CONFIG_IPV6_MULTIPLE_TABLES
4728 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4729 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4730#endif
4731 }
4732
4733 return NOTIFY_OK;
4734}
4735
4736/*
4737 * /proc
4738 */
4739
4740#ifdef CONFIG_PROC_FS
4741
4742static const struct file_operations ipv6_route_proc_fops = {
4743 .open = ipv6_route_open,
4744 .read = seq_read,
4745 .llseek = seq_lseek,
4746 .release = seq_release_net,
4747};
4748
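/* /proc/net/rt6_stats: seven hex fields, printed below in this order:
 * fib nodes, route nodes, routes allocated, route entries, cached
 * routes, dst entries in use, and discarded routes.
 */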
4749static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4750{
4751 struct net *net = (struct net *)seq->private;
4752 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4753 net->ipv6.rt6_stats->fib_nodes,
4754 net->ipv6.rt6_stats->fib_route_nodes,
4755 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4756 net->ipv6.rt6_stats->fib_rt_entries,
4757 net->ipv6.rt6_stats->fib_rt_cache,
4758 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4759 net->ipv6.rt6_stats->fib_discarded_routes);
4760
4761 return 0;
4762}
4763
4764static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4765{
4766 return single_open_net(inode, file, rt6_stats_seq_show);
4767}
4768
4769static const struct file_operations rt6_stats_seq_fops = {
4770 .open = rt6_stats_seq_open,
4771 .read = seq_read,
4772 .llseek = seq_lseek,
4773 .release = single_release_net,
4774};
4775#endif /* CONFIG_PROC_FS */
4776
4777#ifdef CONFIG_SYSCTL
4778
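/* Usage sketch (assuming the usual registration of this table under
 * net.ipv6.route):
 *
 *   echo 2 > /proc/sys/net/ipv6/route/flush
 *
 * stores the delay and triggers fib6_run_gc(); reads are rejected
 * with -EINVAL, matching the write-only 0200 mode set below.
 */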
4779static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4780 void __user *buffer, size_t *lenp, loff_t *ppos)
4781{
4782 struct net *net;
4783 int delay;
4784 int ret;
4785
4786 if (!write)
4787 return -EINVAL;
4788
4789 net = (struct net *)ctl->extra1;
4790 ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
4791 if (ret)
4792 return ret;
4793
4794 /* flush with the delay value that was just written */
4795 delay = net->ipv6.sysctl.flush_delay;
4796 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4797 return 0;
4798}
4794
4795struct ctl_table ipv6_route_table_template[] = {
4796 {
4797 .procname = "flush",
4798 .data = &init_net.ipv6.sysctl.flush_delay,
4799 .maxlen = sizeof(int),
4800 .mode = 0200,
4801 .proc_handler = ipv6_sysctl_rtcache_flush
4802 },
4803 {
4804 .procname = "gc_thresh",
4805 .data = &ip6_dst_ops_template.gc_thresh,
4806 .maxlen = sizeof(int),
4807 .mode = 0644,
4808 .proc_handler = proc_dointvec,
4809 },
4810 {
4811 .procname = "max_size",
4812 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4813 .maxlen = sizeof(int),
4814 .mode = 0644,
4815 .proc_handler = proc_dointvec,
4816 },
4817 {
4818 .procname = "gc_min_interval",
4819 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4820 .maxlen = sizeof(int),
4821 .mode = 0644,
4822 .proc_handler = proc_dointvec_jiffies,
4823 },
4824 {
4825 .procname = "gc_timeout",
4826 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4827 .maxlen = sizeof(int),
4828 .mode = 0644,
4829 .proc_handler = proc_dointvec_jiffies,
4830 },
4831 {
4832 .procname = "gc_interval",
4833 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4834 .maxlen = sizeof(int),
4835 .mode = 0644,
4836 .proc_handler = proc_dointvec_jiffies,
4837 },
4838 {
4839 .procname = "gc_elasticity",
4840 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4841 .maxlen = sizeof(int),
4842 .mode = 0644,
4843 .proc_handler = proc_dointvec,
4844 },
4845 {
4846 .procname = "mtu_expires",
4847 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4848 .maxlen = sizeof(int),
4849 .mode = 0644,
4850 .proc_handler = proc_dointvec_jiffies,
4851 },
4852 {
4853 .procname = "min_adv_mss",
4854 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4855 .maxlen = sizeof(int),
4856 .mode = 0644,
4857 .proc_handler = proc_dointvec,
4858 },
4859 {
4860 .procname = "gc_min_interval_ms",
4861 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4862 .maxlen = sizeof(int),
4863 .mode = 0644,
4864 .proc_handler = proc_dointvec_ms_jiffies,
4865 },
4866 { }
4867};
4868
4869struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4870{
4871 struct ctl_table *table;
4872
4873 table = kmemdup(ipv6_route_table_template,
4874 sizeof(ipv6_route_table_template),
4875 GFP_KERNEL);
4876
4877 if (table) {
4878 table[0].data = &net->ipv6.sysctl.flush_delay;
4879 table[0].extra1 = net;
4880 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4881 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4882 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4883 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4884 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4885 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4886 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4887 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4888 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4889
4890 /* Don't export sysctls to unprivileged users */
4891 if (net->user_ns != &init_user_ns)
4892 table[0].procname = NULL;
4893 }
4894
4895 return table;
4896}
4897#endif
4898
4899static int __net_init ip6_route_net_init(struct net *net)
4900{
4901 int ret = -ENOMEM;
4902
4903 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4904 sizeof(net->ipv6.ip6_dst_ops));
4905
4906 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4907 goto out_ip6_dst_ops;
4908
4909 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4910 sizeof(*net->ipv6.ip6_null_entry),
4911 GFP_KERNEL);
4912 if (!net->ipv6.ip6_null_entry)
4913 goto out_ip6_dst_entries;
4914 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4915 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4916 ip6_template_metrics, true);
4917
4918#ifdef CONFIG_IPV6_MULTIPLE_TABLES
4919 net->ipv6.fib6_has_custom_rules = false;
4920 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4921 sizeof(*net->ipv6.ip6_prohibit_entry),
4922 GFP_KERNEL);
4923 if (!net->ipv6.ip6_prohibit_entry)
4924 goto out_ip6_null_entry;
4925 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4926 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4927 ip6_template_metrics, true);
4928
4929 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4930 sizeof(*net->ipv6.ip6_blk_hole_entry),
4931 GFP_KERNEL);
4932 if (!net->ipv6.ip6_blk_hole_entry)
4933 goto out_ip6_prohibit_entry;
4934 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4935 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4936 ip6_template_metrics, true);
4937#endif
4938
4939 net->ipv6.sysctl.flush_delay = 0;
4940 net->ipv6.sysctl.ip6_rt_max_size = 4096;
4941 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4942 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4943 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4944 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4945 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4946 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4947
4948 net->ipv6.ip6_rt_gc_expire = 30*HZ;
4949
4950 ret = 0;
4951out:
4952 return ret;
4953
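/* Error unwinding mirrors the setup order above: each label frees the
 * object allocated just before the failing step, then falls through
 * into the next label, ending with the jump back to "out".
 */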
4954#ifdef CONFIG_IPV6_MULTIPLE_TABLES
4955out_ip6_prohibit_entry:
4956 kfree(net->ipv6.ip6_prohibit_entry);
4957out_ip6_null_entry:
4958 kfree(net->ipv6.ip6_null_entry);
4959#endif
4960out_ip6_dst_entries:
4961 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4962out_ip6_dst_ops:
4963 goto out;
4964}
4965
4966static void __net_exit ip6_route_net_exit(struct net *net)
4967{
4968 kfree(net->ipv6.ip6_null_entry);
4969#ifdef CONFIG_IPV6_MULTIPLE_TABLES
4970 kfree(net->ipv6.ip6_prohibit_entry);
4971 kfree(net->ipv6.ip6_blk_hole_entry);
4972#endif
4973 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4974}
4975
4976static int __net_init ip6_route_net_init_late(struct net *net)
4977{
4978#ifdef CONFIG_PROC_FS
4979 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4980 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4981#endif
4982 return 0;
4983}
4984
4985static void __net_exit ip6_route_net_exit_late(struct net *net)
4986{
4987#ifdef CONFIG_PROC_FS
4988 remove_proc_entry("ipv6_route", net->proc_net);
4989 remove_proc_entry("rt6_stats", net->proc_net);
4990#endif
4991}
4992
4993static struct pernet_operations ip6_route_net_ops = {
4994 .init = ip6_route_net_init,
4995 .exit = ip6_route_net_exit,
4996};
4997
4998static int __net_init ipv6_inetpeer_init(struct net *net)
4999{
5000 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5001
5002 if (!bp)
5003 return -ENOMEM;
5004 inet_peer_base_init(bp);
5005 net->ipv6.peers = bp;
5006 return 0;
5007}
5008
5009static void __net_exit ipv6_inetpeer_exit(struct net *net)
5010{
5011 struct inet_peer_base *bp = net->ipv6.peers;
5012
5013 net->ipv6.peers = NULL;
5014 inetpeer_invalidate_tree(bp);
5015 kfree(bp);
5016}
5017
5018static struct pernet_operations ipv6_inetpeer_ops = {
5019 .init = ipv6_inetpeer_init,
5020 .exit = ipv6_inetpeer_exit,
5021};
5022
5023static struct pernet_operations ip6_route_net_late_ops = {
5024 .init = ip6_route_net_init_late,
5025 .exit = ip6_route_net_exit_late,
5026};
5027
5028static struct notifier_block ip6_route_dev_notifier = {
5029 .notifier_call = ip6_route_dev_notify,
5030 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5031};
5032
5033void __init ip6_route_init_special_entries(void)
5034{
5035 /* The loopback device is registered before this code runs, so the
5036 * loopback reference in rt6_info is not taken there; take it
5037 * manually for init_net.
5038 */
5038 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5039 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5040#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5041 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5042 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5043 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5044 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5045#endif
5046}
5047
5048int __init ip6_route_init(void)
5049{
5050 int ret;
5051 int cpu;
5052
5053 ret = -ENOMEM;
5054 ip6_dst_ops_template.kmem_cachep =
5055 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5056 SLAB_HWCACHE_ALIGN, NULL);
5057 if (!ip6_dst_ops_template.kmem_cachep)
5058 goto out;
5059
5060 ret = dst_entries_init(&ip6_dst_blackhole_ops);
5061 if (ret)
5062 goto out_kmem_cache;
5063
5064 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5065 if (ret)
5066 goto out_dst_entries;
5067
5068 ret = register_pernet_subsys(&ip6_route_net_ops);
5069 if (ret)
5070 goto out_register_inetpeer;
5071
5072 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5073
5074 ret = fib6_init();
5075 if (ret)
5076 goto out_register_subsys;
5077
5078 ret = xfrm6_init();
5079 if (ret)
5080 goto out_fib6_init;
5081
5082 ret = fib6_rules_init();
5083 if (ret)
5084 goto xfrm6_init;
5085
5086 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5087 if (ret)
5088 goto fib6_rules_init;
5089
5090 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5091 inet6_rtm_newroute, NULL, 0);
5092 if (ret < 0)
5093 goto out_register_late_subsys;
5094
5095 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5096 inet6_rtm_delroute, NULL, 0);
5097 if (ret < 0)
5098 goto out_register_late_subsys;
5099
5100 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5101 inet6_rtm_getroute, NULL,
5102 RTNL_FLAG_DOIT_UNLOCKED);
5103 if (ret < 0)
5104 goto out_register_late_subsys;
5105
5106 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5107 if (ret)
5108 goto out_register_late_subsys;
5109
5110 for_each_possible_cpu(cpu) {
5111 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5112
5113 INIT_LIST_HEAD(&ul->head);
5114 spin_lock_init(&ul->lock);
5115 }
5116
5117out:
5118 return ret;
5119
5120out_register_late_subsys:
5121 rtnl_unregister_all(PF_INET6);
5122 unregister_pernet_subsys(&ip6_route_net_late_ops);
5123fib6_rules_init:
5124 fib6_rules_cleanup();
5125xfrm6_init:
5126 xfrm6_fini();
5127out_fib6_init:
5128 fib6_gc_cleanup();
5129out_register_subsys:
5130 unregister_pernet_subsys(&ip6_route_net_ops);
5131out_register_inetpeer:
5132 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5133out_dst_entries:
5134 dst_entries_destroy(&ip6_dst_blackhole_ops);
5135out_kmem_cache:
5136 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5137 goto out;
5138}
5139
5140void ip6_route_cleanup(void)
5141{
5142 unregister_netdevice_notifier(&ip6_route_dev_notifier);
5143 unregister_pernet_subsys(&ip6_route_net_late_ops);
5144 fib6_rules_cleanup();
5145 xfrm6_fini();
5146 fib6_gc_cleanup();
5147 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5148 unregister_pernet_subsys(&ip6_route_net_ops);
5149 dst_entries_destroy(&ip6_dst_blackhole_ops);
5150 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5151}