2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
68 #include <linux/uaccess.h>
71 #include <linux/sysctl.h>
75 RT6_NUD_FAIL_HARD = -3,
76 RT6_NUD_FAIL_PROBE = -2,
77 RT6_NUD_FAIL_DO_RR = -1,
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void ip6_dst_destroy(struct dst_entry *);
87 static void ip6_dst_ifdown(struct dst_entry *,
88 struct net_device *dev, int how);
89 static int ip6_dst_gc(struct dst_ops *ops);
91 static int ip6_pkt_discard(struct sk_buff *skb);
92 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int ip6_pkt_prohibit(struct sk_buff *skb);
94 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void ip6_link_failure(struct sk_buff *skb);
96 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 struct sk_buff *skb, u32 mtu);
98 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
100 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 struct sk_buff *skb, struct rt6_info *rt,
105 struct in6_addr *dst, struct in6_addr *src,
106 int iif, int type, u32 portid, u32 seq,
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 struct in6_addr *daddr,
110 struct in6_addr *saddr);
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114 const struct in6_addr *prefix, int prefixlen,
115 const struct in6_addr *gwaddr,
116 struct net_device *dev,
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119 const struct in6_addr *prefix, int prefixlen,
120 const struct in6_addr *gwaddr,
121 struct net_device *dev);
124 struct uncached_list {
126 struct list_head head;
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Add an uncached route (one not owned by a fib6 tree) to this CPU's
 * rt6_uncached_list so rt6_uncached_list_flush_dev() can later retarget
 * it when its device goes away.  Lock-protected list insert; caller
 * context allows BH-disabling spinlock.
 */
131 static void rt6_uncached_list_add(struct rt6_info *rt)
133 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
/* remember which per-cpu list we landed on, for later removal */
135 rt->rt6i_uncached_list = ul;
137 spin_lock_bh(&ul->lock);
138 list_add_tail(&rt->rt6i_uncached, &ul->head);
139 spin_unlock_bh(&ul->lock);
/* Remove @rt from the per-cpu uncached list it was added to.
 * The list_empty() check makes this a no-op for routes that were
 * never put on a list (rt6i_uncached is INIT_LIST_HEAD'ed in
 * rt6_info_init), so this is safe to call unconditionally from
 * ip6_dst_destroy().
 */
142 static void rt6_uncached_list_del(struct rt6_info *rt)
144 if (!list_empty(&rt->rt6i_uncached)) {
145 struct uncached_list *ul = rt->rt6i_uncached_list;
147 spin_lock_bh(&ul->lock);
148 list_del(&rt->rt6i_uncached);
149 spin_unlock_bh(&ul->lock);
/* Device-teardown helper: walk every CPU's uncached list and move any
 * route still referencing @dev over to the netns loopback device, so
 * the outgoing device can be unregistered without dangling references.
 * Early-exits when @dev is itself the loopback device (nothing to
 * retarget to).
 */
153 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
155 struct net_device *loopback_dev = net->loopback_dev;
158 if (dev == loopback_dev)
161 for_each_possible_cpu(cpu) {
162 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 spin_lock_bh(&ul->lock);
166 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
167 struct inet6_dev *rt_idev = rt->rt6i_idev;
168 struct net_device *rt_dev = rt->dst.dev;
/* swap the idev reference: take loopback's, drop the dying device's */
170 if (rt_idev->dev == dev) {
171 rt->rt6i_idev = in6_dev_get(loopback_dev);
172 in6_dev_put(rt_idev);
/* NOTE(review): lines testing rt_dev against dev are missing from this
 * view; presumably dst.dev is only rewritten when rt_dev == dev —
 * confirm against the full source.
 */
176 rt->dst.dev = loopback_dev;
177 dev_hold(rt->dst.dev);
181 spin_unlock_bh(&ul->lock);
/* A per-cpu (RTF_PCPU) clone does not own metrics: write through to the
 * parent route it was copied from (rt->dst.from).
 */
185 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
187 return dst_metrics_write_ptr(rt->dst.from);
/* dst_ops->cow_metrics for IPv6 routes.  Dispatch on route kind:
 * RTF_PCPU clones defer to their parent's metrics; otherwise fall back
 * to the generic copy-on-write path.  (The RTF_CACHE branch body at
 * original lines 197-198 is missing from this view — can't tell what it
 * returns; confirm against the full source.)
 */
190 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
192 struct rt6_info *rt = (struct rt6_info *)dst;
194 if (rt->rt6i_flags & RTF_PCPU)
195 return rt6_pcpu_cow_metrics(rt);
196 else if (rt->rt6i_flags & RTF_CACHE)
199 return dst_cow_metrics_generic(dst, old);
/* Pick the address to resolve a neighbour for: the route's gateway when
 * one is set (non-any), otherwise fall back to the packet's destination
 * address.  (Intermediate lines are missing from this view — the @skb
 * NULL-check path between lines 209 and 211 is not visible; presumably
 * the ipv6_hdr() fallback is guarded on skb != NULL — confirm.)
 */
202 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
206 struct in6_addr *p = &rt->rt6i_gateway;
208 if (!ipv6_addr_any(p))
209 return (const void *) p;
211 return &ipv6_hdr(skb)->daddr;
/* dst_ops->neigh_lookup for IPv6: resolve the next-hop neighbour entry
 * for @dst.  Uses choose_neigh_daddr() to prefer the gateway address,
 * then looks up the ndisc cache, creating an entry if none exists.
 */
215 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
219 struct rt6_info *rt = (struct rt6_info *) dst;
222 daddr = choose_neigh_daddr(rt, skb, daddr);
223 n = __ipv6_neigh_lookup(dst->dev, daddr);
/* cache miss (lines testing n are missing here): create a fresh entry */
226 return neigh_create(&nd_tbl, daddr, dst->dev);
/* dst_ops->confirm_neigh: mark the next-hop neighbour as confirmed
 * (reachability feedback from upper layers).  Skips devices that do no
 * neighbour resolution (NOARP/loopback) and multicast destinations,
 * which have no unicast neighbour entry.
 */
229 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
231 struct net_device *dev = dst->dev;
232 struct rt6_info *rt = (struct rt6_info *)dst;
234 daddr = choose_neigh_daddr(rt, NULL, daddr);
237 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
239 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
241 __ipv6_confirm_neigh(dev, daddr);
244 static struct dst_ops ip6_dst_ops_template = {
248 .check = ip6_dst_check,
249 .default_advmss = ip6_default_advmss,
251 .cow_metrics = ipv6_cow_metrics,
252 .destroy = ip6_dst_destroy,
253 .ifdown = ip6_dst_ifdown,
254 .negative_advice = ip6_negative_advice,
255 .link_failure = ip6_link_failure,
256 .update_pmtu = ip6_rt_update_pmtu,
257 .redirect = rt6_do_redirect,
258 .local_out = __ip6_local_out,
259 .neigh_lookup = ip6_neigh_lookup,
260 .confirm_neigh = ip6_confirm_neigh,
/* MTU for blackhole dsts: use the explicit RTAX_MTU metric if set,
 * otherwise fall back to the device MTU.
 */
263 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
265 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
267 return mtu ? : dst->dev->mtu;
270 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
271 struct sk_buff *skb, u32 mtu)
275 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
280 static struct dst_ops ip6_dst_blackhole_ops = {
282 .destroy = ip6_dst_destroy,
283 .check = ip6_dst_check,
284 .mtu = ip6_blackhole_mtu,
285 .default_advmss = ip6_default_advmss,
286 .update_pmtu = ip6_rt_blackhole_update_pmtu,
287 .redirect = ip6_rt_blackhole_redirect,
288 .cow_metrics = dst_cow_metrics_generic,
289 .neigh_lookup = ip6_neigh_lookup,
292 static const u32 ip6_template_metrics[RTAX_MAX] = {
293 [RTAX_HOPLIMIT - 1] = 0,
296 static const struct rt6_info ip6_null_entry_template = {
298 .__refcnt = ATOMIC_INIT(1),
300 .obsolete = DST_OBSOLETE_FORCE_CHK,
301 .error = -ENETUNREACH,
302 .input = ip6_pkt_discard,
303 .output = ip6_pkt_discard_out,
305 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
306 .rt6i_protocol = RTPROT_KERNEL,
307 .rt6i_metric = ~(u32) 0,
308 .rt6i_ref = ATOMIC_INIT(1),
311 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
313 static const struct rt6_info ip6_prohibit_entry_template = {
315 .__refcnt = ATOMIC_INIT(1),
317 .obsolete = DST_OBSOLETE_FORCE_CHK,
319 .input = ip6_pkt_prohibit,
320 .output = ip6_pkt_prohibit_out,
322 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
323 .rt6i_protocol = RTPROT_KERNEL,
324 .rt6i_metric = ~(u32) 0,
325 .rt6i_ref = ATOMIC_INIT(1),
328 static const struct rt6_info ip6_blk_hole_entry_template = {
330 .__refcnt = ATOMIC_INIT(1),
332 .obsolete = DST_OBSOLETE_FORCE_CHK,
334 .input = dst_discard,
335 .output = dst_discard_out,
337 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
338 .rt6i_protocol = RTPROT_KERNEL,
339 .rt6i_metric = ~(u32) 0,
340 .rt6i_ref = ATOMIC_INIT(1),
/* Zero the rt6_info-specific tail of a freshly allocated route (the
 * embedded dst_entry head is left as dst_alloc() set it) and initialize
 * its list heads so later list_empty() checks are valid.
 */
345 static void rt6_info_init(struct rt6_info *rt)
347 struct dst_entry *dst = &rt->dst;
/* clear everything after the dst_entry member */
349 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
350 INIT_LIST_HEAD(&rt->rt6i_siblings);
351 INIT_LIST_HEAD(&rt->rt6i_uncached);
354 /* allocate dst with ip6_dst_ops */
355 static struct rt6_info *__ip6_dst_alloc(struct net *net,
356 struct net_device *dev,
359 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
360 1, DST_OBSOLETE_FORCE_CHK, flags);
368 struct rt6_info *ip6_dst_alloc(struct net *net,
369 struct net_device *dev,
372 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
375 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
379 for_each_possible_cpu(cpu) {
382 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
383 /* no one shares rt */
387 dst_release_immediate(&rt->dst);
394 EXPORT_SYMBOL(ip6_dst_alloc);
396 static void ip6_dst_destroy(struct dst_entry *dst)
398 struct rt6_info *rt = (struct rt6_info *)dst;
399 struct rt6_exception_bucket *bucket;
400 struct dst_entry *from = dst->from;
401 struct inet6_dev *idev;
403 dst_destroy_metrics_generic(dst);
404 free_percpu(rt->rt6i_pcpu);
405 rt6_uncached_list_del(rt);
407 idev = rt->rt6i_idev;
409 rt->rt6i_idev = NULL;
412 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
414 rt->rt6i_exception_bucket = NULL;
422 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
425 struct rt6_info *rt = (struct rt6_info *)dst;
426 struct inet6_dev *idev = rt->rt6i_idev;
427 struct net_device *loopback_dev =
428 dev_net(dev)->loopback_dev;
430 if (idev && idev->dev != loopback_dev) {
431 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
433 rt->rt6i_idev = loopback_idev;
/* Non-recursive expiry check: true only when the route itself carries
 * RTF_EXPIRES and its dst.expires deadline has passed.  (The final
 * "return false" line is missing from this view.)
 */
439 static bool __rt6_check_expired(const struct rt6_info *rt)
441 if (rt->rt6i_flags & RTF_EXPIRES)
442 return time_after(jiffies, rt->dst.expires)
/* Full expiry check: the route is expired if its own RTF_EXPIRES timer
 * has fired, or — for clones that point at a parent via dst.from — if
 * the clone was obsoleted or the parent itself is expired (checked
 * recursively).
 */
447 static bool rt6_check_expired(const struct rt6_info *rt)
449 if (rt->rt6i_flags & RTF_EXPIRES) {
450 if (time_after(jiffies, rt->dst.expires))
452 } else if (rt->dst.from) {
453 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
454 rt6_check_expired((struct rt6_info *)rt->dst.from);
459 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
460 struct flowi6 *fl6, int oif,
463 struct rt6_info *sibling, *next_sibling;
466 /* We might have already computed the hash for ICMPv6 errors. In such
467 * case it will always be non-zero. Otherwise now is the time to do it.
470 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
472 route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
473 /* Don't change the route, if route_choosen == 0
474 * (siblings does not include ourself)
477 list_for_each_entry_safe(sibling, next_sibling,
478 &match->rt6i_siblings, rt6i_siblings) {
480 if (route_choosen == 0) {
481 if (rt6_score_route(sibling, oif, strict) < 0)
491 * Route lookup. Any table->tb6_lock is implied.
494 static inline struct rt6_info *rt6_device_match(struct net *net,
496 const struct in6_addr *saddr,
500 struct rt6_info *local = NULL;
501 struct rt6_info *sprt;
503 if (!oif && ipv6_addr_any(saddr))
506 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
507 struct net_device *dev = sprt->dst.dev;
510 if (dev->ifindex == oif)
512 if (dev->flags & IFF_LOOPBACK) {
513 if (!sprt->rt6i_idev ||
514 sprt->rt6i_idev->dev->ifindex != oif) {
515 if (flags & RT6_LOOKUP_F_IFACE)
518 local->rt6i_idev->dev->ifindex == oif)
524 if (ipv6_chk_addr(net, saddr, dev,
525 flags & RT6_LOOKUP_F_IFACE))
534 if (flags & RT6_LOOKUP_F_IFACE)
535 return net->ipv6.ip6_null_entry;
541 #ifdef CONFIG_IPV6_ROUTER_PREF
542 struct __rt6_probe_work {
543 struct work_struct work;
544 struct in6_addr target;
545 struct net_device *dev;
548 static void rt6_probe_deferred(struct work_struct *w)
550 struct in6_addr mcaddr;
551 struct __rt6_probe_work *work =
552 container_of(w, struct __rt6_probe_work, work);
554 addrconf_addr_solict_mult(&work->target, &mcaddr);
555 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
560 static void rt6_probe(struct rt6_info *rt)
562 struct __rt6_probe_work *work;
563 struct neighbour *neigh;
565 * Okay, this does not seem to be appropriate
566 * for now, however, we need to check if it
567 * is really so; aka Router Reachability Probing.
569 * Router Reachability Probe MUST be rate-limited
570 * to no more than one per minute.
572 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
575 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
577 if (neigh->nud_state & NUD_VALID)
581 write_lock(&neigh->lock);
582 if (!(neigh->nud_state & NUD_VALID) &&
585 rt->rt6i_idev->cnf.rtr_probe_interval)) {
586 work = kmalloc(sizeof(*work), GFP_ATOMIC);
588 __neigh_set_probe_once(neigh);
590 write_unlock(&neigh->lock);
592 work = kmalloc(sizeof(*work), GFP_ATOMIC);
596 INIT_WORK(&work->work, rt6_probe_deferred);
597 work->target = rt->rt6i_gateway;
598 dev_hold(rt->dst.dev);
599 work->dev = rt->dst.dev;
600 schedule_work(&work->work);
604 rcu_read_unlock_bh();
607 static inline void rt6_probe(struct rt6_info *rt)
613 * Default Router Selection (RFC 2461 6.3.6)
/* Score the route's device against the requested output interface @oif.
 * Matches when no oif was requested or the device index matches; a
 * loopback device whose idev belongs to oif also counts (local routes).
 * The exact return values on each branch are on lines missing from this
 * view — per the scoring scheme in rt6_score_route(), presumably small
 * positive weights vs 0; confirm against the full source.
 */
615 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
617 struct net_device *dev = rt->dst.dev;
618 if (!oif || dev->ifindex == oif)
620 if ((dev->flags & IFF_LOOPBACK) &&
621 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Classify next-hop reachability for router selection (RFC 2461 6.3.6).
 * Non-gateway / NONEXTHOP routes trivially succeed (no neighbour to
 * check).  For gateways, inspect the ndisc cache entry's NUD state:
 * VALID succeeds; with router-preference support a not-yet-FAILED entry
 * also succeeds, and a FAILED one asks the caller to probe
 * (RT6_NUD_FAIL_PROBE).  No cache entry at all succeeds under
 * CONFIG_IPV6_ROUTER_PREF but requests round-robin otherwise.
 */
626 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
628 struct neighbour *neigh;
629 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
631 if (rt->rt6i_flags & RTF_NONEXTHOP ||
632 !(rt->rt6i_flags & RTF_GATEWAY))
633 return RT6_NUD_SUCCEED;
636 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
638 read_lock(&neigh->lock);
639 if (neigh->nud_state & NUD_VALID)
640 ret = RT6_NUD_SUCCEED;
641 #ifdef CONFIG_IPV6_ROUTER_PREF
642 else if (!(neigh->nud_state & NUD_FAILED))
643 ret = RT6_NUD_SUCCEED;
645 ret = RT6_NUD_FAIL_PROBE;
647 read_unlock(&neigh->lock);
/* no neighbour entry found */
649 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
650 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
/* lookup above runs under rcu_read_lock_bh (taken on a missing line) */
652 rcu_read_unlock_bh();
657 static int rt6_score_route(struct rt6_info *rt, int oif,
662 m = rt6_check_dev(rt, oif);
663 if (!m && (strict & RT6_LOOKUP_F_IFACE))
664 return RT6_NUD_FAIL_HARD;
665 #ifdef CONFIG_IPV6_ROUTER_PREF
666 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
668 if (strict & RT6_LOOKUP_F_REACHABLE) {
669 int n = rt6_check_neigh(rt);
676 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
677 int *mpri, struct rt6_info *match,
681 bool match_do_rr = false;
682 struct inet6_dev *idev = rt->rt6i_idev;
683 struct net_device *dev = rt->dst.dev;
685 if (dev && !netif_carrier_ok(dev) &&
686 idev->cnf.ignore_routes_with_linkdown &&
687 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
690 if (rt6_check_expired(rt))
693 m = rt6_score_route(rt, oif, strict);
694 if (m == RT6_NUD_FAIL_DO_RR) {
696 m = 0; /* lowest valid score */
697 } else if (m == RT6_NUD_FAIL_HARD) {
701 if (strict & RT6_LOOKUP_F_REACHABLE)
704 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
706 *do_rr = match_do_rr;
714 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
715 struct rt6_info *leaf,
716 struct rt6_info *rr_head,
717 u32 metric, int oif, int strict,
720 struct rt6_info *rt, *match, *cont;
725 for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
726 if (rt->rt6i_metric != metric) {
731 match = find_match(rt, oif, strict, &mpri, match, do_rr);
734 for (rt = leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
735 if (rt->rt6i_metric != metric) {
740 match = find_match(rt, oif, strict, &mpri, match, do_rr);
746 for (rt = cont; rt; rt = rt->dst.rt6_next)
747 match = find_match(rt, oif, strict, &mpri, match, do_rr);
752 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
755 struct rt6_info *leaf = fn->leaf;
756 struct rt6_info *match, *rt0;
760 return net->ipv6.ip6_null_entry;
764 fn->rr_ptr = rt0 = leaf;
766 match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
770 struct rt6_info *next = rt0->dst.rt6_next;
772 /* no entries matched; do round-robin */
773 if (!next || next->rt6i_metric != rt0->rt6i_metric)
780 return match ? match : net->ipv6.ip6_null_entry;
/* True when the route either has a gateway or is marked NONEXTHOP —
 * i.e. its destination is not directly the on-link target.
 */
783 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
785 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
788 #ifdef CONFIG_IPV6_ROUTE_INFO
789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
790 const struct in6_addr *gwaddr)
792 struct net *net = dev_net(dev);
793 struct route_info *rinfo = (struct route_info *) opt;
794 struct in6_addr prefix_buf, *prefix;
796 unsigned long lifetime;
799 if (len < sizeof(struct route_info)) {
803 /* Sanity check for prefix_len and length */
804 if (rinfo->length > 3) {
806 } else if (rinfo->prefix_len > 128) {
808 } else if (rinfo->prefix_len > 64) {
809 if (rinfo->length < 2) {
812 } else if (rinfo->prefix_len > 0) {
813 if (rinfo->length < 1) {
818 pref = rinfo->route_pref;
819 if (pref == ICMPV6_ROUTER_PREF_INVALID)
822 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
824 if (rinfo->length == 3)
825 prefix = (struct in6_addr *)rinfo->prefix;
827 /* this function is safe */
828 ipv6_addr_prefix(&prefix_buf,
829 (struct in6_addr *)rinfo->prefix,
831 prefix = &prefix_buf;
834 if (rinfo->prefix_len == 0)
835 rt = rt6_get_dflt_router(gwaddr, dev);
837 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
840 if (rt && !lifetime) {
846 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
849 rt->rt6i_flags = RTF_ROUTEINFO |
850 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
853 if (!addrconf_finite_timeout(lifetime))
854 rt6_clean_expires(rt);
856 rt6_set_expires(rt, jiffies + HZ * lifetime);
864 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
865 struct in6_addr *saddr)
867 struct fib6_node *pn;
869 if (fn->fn_flags & RTN_TL_ROOT)
872 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
873 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
876 if (fn->fn_flags & RTN_RTINFO)
881 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
884 struct rt6_info *rt = *prt;
886 if (dst_hold_safe(&rt->dst))
889 rt = net->ipv6.ip6_null_entry;
898 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
899 struct fib6_table *table,
900 struct flowi6 *fl6, int flags)
902 struct rt6_info *rt, *rt_cache;
903 struct fib6_node *fn;
905 read_lock_bh(&table->tb6_lock);
906 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
909 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
910 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
911 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
912 if (rt == net->ipv6.ip6_null_entry) {
913 fn = fib6_backtrack(fn, &fl6->saddr);
917 /* Search through exception table */
918 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
922 if (ip6_hold_safe(net, &rt, true))
923 dst_use_noref(&rt->dst, jiffies);
925 read_unlock_bh(&table->tb6_lock);
927 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
933 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
936 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
938 EXPORT_SYMBOL_GPL(ip6_route_lookup);
940 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
941 const struct in6_addr *saddr, int oif, int strict)
943 struct flowi6 fl6 = {
947 struct dst_entry *dst;
948 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
951 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
952 flags |= RT6_LOOKUP_F_HAS_SADDR;
955 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
957 return (struct rt6_info *) dst;
963 EXPORT_SYMBOL(rt6_lookup);
965 /* ip6_ins_rt is called with FREE table->tb6_lock.
966 * It takes new route entry, the addition fails by any reason the
968 * Caller must hold dst before calling it.
971 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
972 struct mx6_config *mxc,
973 struct netlink_ext_ack *extack)
976 struct fib6_table *table;
978 table = rt->rt6i_table;
979 write_lock_bh(&table->tb6_lock);
980 err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
981 write_unlock_bh(&table->tb6_lock);
986 int ip6_ins_rt(struct rt6_info *rt)
988 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
989 struct mx6_config mxc = { .mx = NULL, };
991 /* Hold dst to account for the reference from the fib6 tree */
993 return __ip6_ins_rt(rt, &info, &mxc, NULL);
996 /* called with rcu_lock held */
997 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
999 struct net_device *dev = rt->dst.dev;
1001 if (rt->rt6i_flags & RTF_LOCAL) {
1002 /* for copies of local routes, dst->dev needs to be the
1003 * device if it is a master device, the master device if
1004 * device is enslaved, and the loopback as the default
1006 if (netif_is_l3_slave(dev) &&
1007 !rt6_need_strict(&rt->rt6i_dst.addr))
1008 dev = l3mdev_master_dev_rcu(dev);
1009 else if (!netif_is_l3_master(dev))
1010 dev = dev_net(dev)->loopback_dev;
1011 /* last case is netif_is_l3_master(dev) is true in which
1012 * case we want dev returned to be dev
1019 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1020 const struct in6_addr *daddr,
1021 const struct in6_addr *saddr)
1023 struct net_device *dev;
1024 struct rt6_info *rt;
1030 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1031 ort = (struct rt6_info *)ort->dst.from;
1034 dev = ip6_rt_get_dev_rcu(ort);
1035 rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1040 ip6_rt_copy_init(rt, ort);
1041 rt->rt6i_flags |= RTF_CACHE;
1042 rt->rt6i_metric = 0;
1043 rt->dst.flags |= DST_HOST;
1044 rt->rt6i_dst.addr = *daddr;
1045 rt->rt6i_dst.plen = 128;
1047 if (!rt6_is_gw_or_nonexthop(ort)) {
1048 if (ort->rt6i_dst.plen != 128 &&
1049 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1050 rt->rt6i_flags |= RTF_ANYCAST;
1051 #ifdef CONFIG_IPV6_SUBTREES
1052 if (rt->rt6i_src.plen && saddr) {
1053 rt->rt6i_src.addr = *saddr;
1054 rt->rt6i_src.plen = 128;
1062 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1064 struct net_device *dev;
1065 struct rt6_info *pcpu_rt;
1068 dev = ip6_rt_get_dev_rcu(rt);
1069 pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1073 ip6_rt_copy_init(pcpu_rt, rt);
1074 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1075 pcpu_rt->rt6i_flags |= RTF_PCPU;
1079 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1080 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1082 struct rt6_info *pcpu_rt, **p;
1084 p = this_cpu_ptr(rt->rt6i_pcpu);
1087 if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1088 rt6_dst_from_metrics_check(pcpu_rt);
1093 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1095 struct rt6_info *pcpu_rt, *prev, **p;
1097 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1099 struct net *net = dev_net(rt->dst.dev);
1101 dst_hold(&net->ipv6.ip6_null_entry->dst);
1102 return net->ipv6.ip6_null_entry;
1105 dst_hold(&pcpu_rt->dst);
1106 p = this_cpu_ptr(rt->rt6i_pcpu);
1107 prev = cmpxchg(p, NULL, pcpu_rt);
1109 /* If someone did it before us, return prev instead */
1110 /* release refcnt taken by ip6_rt_pcpu_alloc() */
1111 dst_release_immediate(&pcpu_rt->dst);
1112 /* release refcnt taken by above dst_hold() */
1113 dst_release_immediate(&pcpu_rt->dst);
1114 dst_hold(&prev->dst);
1118 rt6_dst_from_metrics_check(pcpu_rt);
1122 /* exception hash table implementation
1124 static DEFINE_SPINLOCK(rt6_exception_lock);
1126 /* Remove rt6_ex from hash table and free the memory
1127 * Caller must hold rt6_exception_lock
1129 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1130 struct rt6_exception *rt6_ex)
1132 if (!bucket || !rt6_ex)
1134 rt6_ex->rt6i->rt6i_node = NULL;
1135 hlist_del_rcu(&rt6_ex->hlist);
1136 rt6_release(rt6_ex->rt6i);
1137 kfree_rcu(rt6_ex, rcu);
1138 WARN_ON_ONCE(!bucket->depth);
1142 /* Remove oldest rt6_ex in bucket and free the memory
1143 * Caller must hold rt6_exception_lock
1145 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1147 struct rt6_exception *rt6_ex, *oldest = NULL;
1152 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1153 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1156 rt6_remove_exception(bucket, oldest);
/* Hash a (dst, src) address pair into an exception-table bucket index.
 * The jhash seed is generated once lazily; src only contributes under
 * CONFIG_IPV6_SUBTREES (source-routed subtrees), matching the src_key
 * conventions in the exception insert/lookup helpers.
 */
1159 static u32 rt6_exception_hash(const struct in6_addr *dst,
1160 const struct in6_addr *src)
1162 static u32 seed __read_mostly;
1165 net_get_random_once(&seed, sizeof(seed));
1166 val = jhash(dst, sizeof(*dst), seed);
1168 #ifdef CONFIG_IPV6_SUBTREES
/* fold src in only when subtrees are compiled in (and src != NULL,
 * guarded on a line missing from this view) */
1170 val = jhash(src, sizeof(*src), val);
/* reduce to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits */
1172 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1175 /* Helper function to find the cached rt in the hash table
1176 * and update bucket pointer to point to the bucket for this
1177 * (daddr, saddr) pair
1178 * Caller must hold rt6_exception_lock
1180 static struct rt6_exception *
1181 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1182 const struct in6_addr *daddr,
1183 const struct in6_addr *saddr)
1185 struct rt6_exception *rt6_ex;
1188 if (!(*bucket) || !daddr)
1191 hval = rt6_exception_hash(daddr, saddr);
1194 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1195 struct rt6_info *rt6 = rt6_ex->rt6i;
1196 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1198 #ifdef CONFIG_IPV6_SUBTREES
1199 if (matched && saddr)
1200 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1208 /* Helper function to find the cached rt in the hash table
1209 * and update bucket pointer to point to the bucket for this
1210 * (daddr, saddr) pair
1211 * Caller must hold rcu_read_lock()
1213 static struct rt6_exception *
1214 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1215 const struct in6_addr *daddr,
1216 const struct in6_addr *saddr)
1218 struct rt6_exception *rt6_ex;
1221 WARN_ON_ONCE(!rcu_read_lock_held());
1223 if (!(*bucket) || !daddr)
1226 hval = rt6_exception_hash(daddr, saddr);
1229 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1230 struct rt6_info *rt6 = rt6_ex->rt6i;
1231 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1233 #ifdef CONFIG_IPV6_SUBTREES
1234 if (matched && saddr)
1235 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1243 static int rt6_insert_exception(struct rt6_info *nrt,
1244 struct rt6_info *ort)
1246 struct rt6_exception_bucket *bucket;
1247 struct in6_addr *src_key = NULL;
1248 struct rt6_exception *rt6_ex;
1251 /* ort can't be a cache or pcpu route */
1252 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1253 ort = (struct rt6_info *)ort->dst.from;
1254 WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1256 spin_lock_bh(&rt6_exception_lock);
1258 if (ort->exception_bucket_flushed) {
1263 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1264 lockdep_is_held(&rt6_exception_lock));
1266 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1272 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1275 #ifdef CONFIG_IPV6_SUBTREES
1276 /* rt6i_src.plen != 0 indicates ort is in subtree
1277 * and exception table is indexed by a hash of
1278 * both rt6i_dst and rt6i_src.
1279 * Otherwise, the exception table is indexed by
1280 * a hash of only rt6i_dst.
1282 if (ort->rt6i_src.plen)
1283 src_key = &nrt->rt6i_src.addr;
1286 /* Update rt6i_prefsrc as it could be changed
1287 * in rt6_remove_prefsrc()
1289 nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1290 /* rt6_mtu_change() might lower mtu on ort.
1291 * Only insert this exception route if its mtu
1292 * is less than ort's mtu value.
1294 if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1299 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1302 rt6_remove_exception(bucket, rt6_ex);
1304 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1310 rt6_ex->stamp = jiffies;
1311 atomic_inc(&nrt->rt6i_ref);
1312 nrt->rt6i_node = ort->rt6i_node;
1313 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1316 if (bucket->depth > FIB6_MAX_DEPTH)
1317 rt6_exception_remove_oldest(bucket);
1320 spin_unlock_bh(&rt6_exception_lock);
1322 /* Update fn->fn_sernum to invalidate all cached dst */
1324 fib6_update_sernum(ort);
1329 void rt6_flush_exceptions(struct rt6_info *rt)
1331 struct rt6_exception_bucket *bucket;
1332 struct rt6_exception *rt6_ex;
1333 struct hlist_node *tmp;
1336 spin_lock_bh(&rt6_exception_lock);
1337 /* Prevent rt6_insert_exception() to recreate the bucket list */
1338 rt->exception_bucket_flushed = 1;
1340 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1341 lockdep_is_held(&rt6_exception_lock));
1345 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1346 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1347 rt6_remove_exception(bucket, rt6_ex);
1348 WARN_ON_ONCE(bucket->depth);
1353 spin_unlock_bh(&rt6_exception_lock);
1356 /* Find cached rt in the hash table inside passed in rt
1357 * Caller has to hold rcu_read_lock()
1359 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1360 struct in6_addr *daddr,
1361 struct in6_addr *saddr)
1363 struct rt6_exception_bucket *bucket;
1364 struct in6_addr *src_key = NULL;
1365 struct rt6_exception *rt6_ex;
1366 struct rt6_info *res = NULL;
1368 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1370 #ifdef CONFIG_IPV6_SUBTREES
1371 /* rt6i_src.plen != 0 indicates rt is in subtree
1372 * and exception table is indexed by a hash of
1373 * both rt6i_dst and rt6i_src.
1374 * Otherwise, the exception table is indexed by
1375 * a hash of only rt6i_dst.
1377 if (rt->rt6i_src.plen)
1380 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1382 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1388 /* Remove the passed in cached rt from the hash table that contains it */
/* Remove the passed-in cached (RTF_CACHE) route from the exception
 * table of its parent route ('from' = rt->dst.from).  Takes
 * rt6_exception_lock; returns 0 on success, negative errno otherwise
 * (error paths are on lines missing from this view).
 */
1389 int rt6_remove_exception_rt(struct rt6_info *rt)
1391 struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1392 struct rt6_exception_bucket *bucket;
1393 struct in6_addr *src_key = NULL;
1394 struct rt6_exception *rt6_ex;
/* BUG(review): '|' here is almost certainly a typo for '&'.
 * !(flags | RTF_CACHE) is always false (RTF_CACHE != 0), so this
 * "must be a cache route" guard never fires.  Same defect appears in
 * rt6_update_exception_stamp_rt().  Should read
 * !(rt->rt6i_flags & RTF_CACHE).
 */
1398 !(rt->rt6i_flags | RTF_CACHE))
1401 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1404 spin_lock_bh(&rt6_exception_lock);
1405 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1406 lockdep_is_held(&rt6_exception_lock));
1407 #ifdef CONFIG_IPV6_SUBTREES
1408 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1409 * and exception table is indexed by a hash of
1410 * both rt6i_dst and rt6i_src.
1411 * Otherwise, the exception table is indexed by
1412 * a hash of only rt6i_dst.
1414 if (from->rt6i_src.plen)
1415 src_key = &rt->rt6i_src.addr;
1417 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1421 rt6_remove_exception(bucket, rt6_ex);
1427 spin_unlock_bh(&rt6_exception_lock);
1431 /* Find rt6_ex which contains the passed in rt cache and
/* Refresh the last-used timestamp (->stamp) of the exception entry that
 * holds this cached route, so rt6_exception_remove_oldest() evicts
 * genuinely stale entries first.  Lookup is RCU-based (read side of the
 * exception table).
 */
1434 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1436 struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1437 struct rt6_exception_bucket *bucket;
1438 struct in6_addr *src_key = NULL;
1439 struct rt6_exception *rt6_ex;
/* BUG(review): '|' should be '&' — !(flags | RTF_CACHE) is always
 * false, so this guard never rejects non-cache routes.  Same defect as
 * in rt6_remove_exception_rt().
 */
1442 !(rt->rt6i_flags | RTF_CACHE))
1446 bucket = rcu_dereference(from->rt6i_exception_bucket);
1448 #ifdef CONFIG_IPV6_SUBTREES
1449 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1450 * and exception table is indexed by a hash of
1451 * both rt6i_dst and rt6i_src.
1452 * Otherwise, the exception table is indexed by
1453 * a hash of only rt6i_dst.
1455 if (from->rt6i_src.plen)
1456 src_key = &rt->rt6i_src.addr;
1458 rt6_ex = __rt6_find_exception_rcu(&bucket,
1462 rt6_ex->stamp = jiffies;
/* Clear the preferred-source address (rt6i_prefsrc.plen = 0) on every
 * cached exception route hanging off @rt.  Caller must hold
 * rt6_exception_lock (see rcu_dereference_protected below).
 * NOTE(review): excerpt -- some original lines are elided in this view.
 */
1467 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1469 struct rt6_exception_bucket *bucket;
1470 struct rt6_exception *rt6_ex;
1473 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1474 lockdep_is_held(&rt6_exception_lock));
1477 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1478 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1479 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
/* Walk all exception buckets of @rt and lower each cached route's
 * rt6i_pmtu to @mtu when the cached value is larger.  Entries with
 * rt6i_pmtu == 0 (redirected routes) are deliberately left alone.
 * Caller holds rt6_exception_lock.
 * NOTE(review): excerpt -- some original lines are elided in this view.
 */
1486 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1488 struct rt6_exception_bucket *bucket;
1489 struct rt6_exception *rt6_ex;
1492 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1493 lockdep_is_held(&rt6_exception_lock));
1496 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1497 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1498 struct rt6_info *entry = rt6_ex->rt6i;
1499 /* For RTF_CACHE with rt6i_pmtu == 0
1500 * (i.e. a redirected route),
1501 * the metrics of its rt->dst.from has already
1504 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1505 entry->rt6i_pmtu = mtu;
1512 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
/* Drop every cached exception route of @rt whose flags match
 * RTF_CACHE_GATEWAY exactly and whose gateway equals @gateway.
 * Uses hlist_for_each_entry_safe because entries are removed while
 * iterating.  Takes rt6_exception_lock itself.
 * NOTE(review): excerpt -- some original lines are elided in this view.
 */
1514 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1515 struct in6_addr *gateway)
1517 struct rt6_exception_bucket *bucket;
1518 struct rt6_exception *rt6_ex;
1519 struct hlist_node *tmp;
1522 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1525 spin_lock_bh(&rt6_exception_lock);
1526 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1527 lockdep_is_held(&rt6_exception_lock));
1530 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1531 hlist_for_each_entry_safe(rt6_ex, tmp,
1532 &bucket->chain, hlist) {
1533 struct rt6_info *entry = rt6_ex->rt6i;
1535 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1536 RTF_CACHE_GATEWAY &&
1537 ipv6_addr_equal(gateway,
1538 &entry->rt6i_gateway)) {
1539 rt6_remove_exception(bucket, rt6_ex);
1546 spin_unlock_bh(&rt6_exception_lock);
/* GC helper: decide whether one cached exception route should be freed.
 * Removes the entry when it is unreferenced (refcnt == 1) and past its
 * timeout, or when it is a gateway route whose neighbour entry no longer
 * carries NTF_ROUTER.  Called under rt6_exception_lock by
 * rt6_age_exceptions().
 * NOTE(review): excerpt -- some original lines are elided in this view.
 */
1549 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1550 struct rt6_exception *rt6_ex,
1551 struct fib6_gc_args *gc_args,
1554 struct rt6_info *rt = rt6_ex->rt6i;
1556 if (atomic_read(&rt->dst.__refcnt) == 1 &&
1557 time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1558 RT6_TRACE("aging clone %p\n", rt);
1559 rt6_remove_exception(bucket, rt6_ex);
1561 } else if (rt->rt6i_flags & RTF_GATEWAY) {
1562 struct neighbour *neigh;
1563 __u8 neigh_flags = 0;
1565 neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1567 neigh_flags = neigh->flags;
1568 neigh_release(neigh);
1570 if (!(neigh_flags & NTF_ROUTER)) {
1571 RT6_TRACE("purging route %p via non-router but gateway\n",
1573 rt6_remove_exception(bucket, rt6_ex);
/* Garbage-collect the whole exception table of @rt: walk every bucket
 * and let rt6_age_examine_exception() decide per entry.  Takes
 * rt6_exception_lock; safe iteration because entries may be removed.
 * NOTE(review): excerpt -- some original lines are elided in this view.
 */
1580 void rt6_age_exceptions(struct rt6_info *rt,
1581 struct fib6_gc_args *gc_args,
1584 struct rt6_exception_bucket *bucket;
1585 struct rt6_exception *rt6_ex;
1586 struct hlist_node *tmp;
1589 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1592 spin_lock_bh(&rt6_exception_lock);
1593 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1594 lockdep_is_held(&rt6_exception_lock));
1597 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1598 hlist_for_each_entry_safe(rt6_ex, tmp,
1599 &bucket->chain, hlist) {
1600 rt6_age_examine_exception(bucket, rt6_ex,
1606 spin_unlock_bh(&rt6_exception_lock);
/* Core policy-routing lookup for one fib6 table.
 * Finds the best route for @fl6 in @table, handling backtracking to less
 * specific prefixes, multipath sibling selection, the RTF_CACHE exception
 * table, an uncached clone for FLOWI_FLAG_KNOWN_NH, and otherwise a
 * per-cpu route copy.  All paths drop tb6_lock before returning and emit
 * a fib6_table_lookup trace event.
 * NOTE(review): excerpt -- labels (redo_rt6_select etc.) and several
 * branches are elided in this view; do not infer full control flow here.
 */
1609 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1610 int oif, struct flowi6 *fl6, int flags)
1612 struct fib6_node *fn, *saved_fn;
1613 struct rt6_info *rt, *rt_cache;
1616 strict |= flags & RT6_LOOKUP_F_IFACE;
1617 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
/* When forwarding is globally off, prefer routers that look reachable. */
1618 if (net->ipv6.devconf_all->forwarding == 0)
1619 strict |= RT6_LOOKUP_F_REACHABLE;
1621 read_lock_bh(&table->tb6_lock);
1623 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1626 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1630 rt = rt6_select(net, fn, oif, strict);
1631 if (rt->rt6i_nsiblings)
1632 rt = rt6_multipath_select(rt, fl6, oif, strict);
1633 if (rt == net->ipv6.ip6_null_entry) {
1634 fn = fib6_backtrack(fn, &fl6->saddr);
1636 goto redo_rt6_select;
1637 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1638 /* also consider unreachable route */
1639 strict &= ~RT6_LOOKUP_F_REACHABLE;
1641 goto redo_rt6_select;
1645 /*Search through exception table */
1646 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1650 if (rt == net->ipv6.ip6_null_entry) {
1651 read_unlock_bh(&table->tb6_lock);
1653 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1655 } else if (rt->rt6i_flags & RTF_CACHE) {
1656 if (ip6_hold_safe(net, &rt, true)) {
1657 dst_use_noref(&rt->dst, jiffies);
1658 rt6_dst_from_metrics_check(rt);
1660 read_unlock_bh(&table->tb6_lock);
1661 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1663 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1664 !(rt->rt6i_flags & RTF_GATEWAY))) {
1665 /* Create a RTF_CACHE clone which will not be
1666 * owned by the fib6 tree. It is for the special case where
1667 * the daddr in the skb during the neighbor look-up is different
1668 * from the fl6->daddr used to look-up route here.
1671 struct rt6_info *uncached_rt;
1673 if (ip6_hold_safe(net, &rt, true)) {
1674 dst_use_noref(&rt->dst, jiffies);
1676 read_unlock_bh(&table->tb6_lock);
1678 goto uncached_rt_out;
1680 read_unlock_bh(&table->tb6_lock);
1682 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1683 dst_release(&rt->dst);
1686 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1687 * No need for another dst_hold()
1689 rt6_uncached_list_add(uncached_rt);
/* Allocation failed: fall back to the null entry with its own hold. */
1691 uncached_rt = net->ipv6.ip6_null_entry;
1692 dst_hold(&uncached_rt->dst);
1696 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1700 /* Get a percpu copy */
1702 struct rt6_info *pcpu_rt;
1704 dst_use_noref(&rt->dst, jiffies);
1705 pcpu_rt = rt6_get_pcpu_route(rt);
1708 read_unlock_bh(&table->tb6_lock);
1710 /* atomic_inc_not_zero() is needed when using rcu */
1711 if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1712 /* We have to do the read_unlock first
1713 * because rt6_make_pcpu_route() may trigger
1714 * ip6_dst_gc() which will take the write_lock.
1716 * No dst_hold() on rt is needed because grabbing
1717 * rt->rt6i_ref makes sure rt can't be released.
1719 read_unlock_bh(&table->tb6_lock);
1720 pcpu_rt = rt6_make_pcpu_route(rt);
1723 /* rt is already removed from tree */
1724 read_unlock_bh(&table->tb6_lock);
1725 pcpu_rt = net->ipv6.ip6_null_entry;
1726 dst_hold(&pcpu_rt->dst);
1730 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1734 EXPORT_SYMBOL_GPL(ip6_pol_route);
/* Input-path wrapper: route using the incoming interface (flowi6_iif). */
1736 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1737 struct flowi6 *fl6, int flags)
1739 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/* Perform an input route lookup via the fib rules engine.  Forces strict
 * interface matching for link-local/multicast destinations, except on
 * PIM register devices.
 */
1742 struct dst_entry *ip6_route_input_lookup(struct net *net,
1743 struct net_device *dev,
1744 struct flowi6 *fl6, int flags)
1746 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1747 flags |= RT6_LOOKUP_F_IFACE;
1749 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1751 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* Extract L3 flow keys for multipath hashing.  For ICMPv6 error messages
 * (dest-unreach, pkt-too-big, time-exceeded, paramprob) hash on the
 * embedded inner header instead of the outer one, so error replies follow
 * the same path as the original flow.
 * NOTE(review): excerpt -- the early-out gotos' target labels are elided.
 */
1753 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1754 struct flow_keys *keys)
1756 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1757 const struct ipv6hdr *key_iph = outer_iph;
1758 const struct ipv6hdr *inner_iph;
1759 const struct icmp6hdr *icmph;
1760 struct ipv6hdr _inner_iph;
1762 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1765 icmph = icmp6_hdr(skb);
1766 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1767 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1768 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1769 icmph->icmp6_type != ICMPV6_PARAMPROB)
1772 inner_iph = skb_header_pointer(skb,
1773 skb_transport_offset(skb) + sizeof(*icmph),
1774 sizeof(_inner_iph), &_inner_iph);
1778 key_iph = inner_iph;
1780 memset(keys, 0, sizeof(*keys));
1781 keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1782 keys->addrs.v6addrs.src = key_iph->saddr;
1783 keys->addrs.v6addrs.dst = key_iph->daddr;
1784 keys->tags.flow_label = ip6_flowinfo(key_iph);
1785 keys->basic.ip_proto = key_iph->nexthdr;
1788 /* if skb is set it will be used and fl6 can be NULL */
/* Compute the multipath hash: from the skb's L3 keys when a packet is
 * available, otherwise from the flowi6 fields.
 */
1789 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1791 struct flow_keys hash_keys;
1794 ip6_multipath_l3_keys(skb, &hash_keys);
1795 return flow_hash_from_keys(&hash_keys);
1798 return get_hash_from_flowi6(fl6);
/* Entry point for routing a received packet: build a flowi6 from the IPv6
 * header (plus any collect-metadata tunnel id), precompute the multipath
 * hash for ICMPv6 so errors follow the original flow, and attach the
 * looked-up dst to the skb.
 */
1801 void ip6_route_input(struct sk_buff *skb)
1803 const struct ipv6hdr *iph = ipv6_hdr(skb);
1804 struct net *net = dev_net(skb->dev);
1805 int flags = RT6_LOOKUP_F_HAS_SADDR;
1806 struct ip_tunnel_info *tun_info;
1807 struct flowi6 fl6 = {
1808 .flowi6_iif = skb->dev->ifindex,
1809 .daddr = iph->daddr,
1810 .saddr = iph->saddr,
1811 .flowlabel = ip6_flowinfo(iph),
1812 .flowi6_mark = skb->mark,
1813 .flowi6_proto = iph->nexthdr,
1816 tun_info = skb_tunnel_info(skb);
1817 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1818 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1819 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1820 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1822 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Output-path wrapper: route using the outgoing interface (flowi6_oif). */
1825 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1826 struct flowi6 *fl6, int flags)
1828 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* Output route lookup for locally generated traffic.  Link-local scope
 * destinations go through the l3mdev link-scope lookup; otherwise strict
 * interface matching is forced when the socket is device-bound or an oif
 * is given without a source address.
 * NOTE(review): excerpt -- some original lines are elided in this view.
 */
1831 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1832 struct flowi6 *fl6, int flags)
1836 if (rt6_need_strict(&fl6->daddr)) {
1837 struct dst_entry *dst;
1839 dst = l3mdev_link_scope_lookup(net, fl6);
1844 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1846 any_src = ipv6_addr_any(&fl6->saddr);
1847 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1848 (fl6->flowi6_oif && any_src))
1849 flags |= RT6_LOOKUP_F_IFACE;
1852 flags |= RT6_LOOKUP_F_HAS_SADDR;
1854 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1856 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1858 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* Clone @dst_orig into a blackhole route (input/output both discard)
 * bound to the loopback device, copying metrics and rt6 keys.  Releases
 * the original dst; returns ERR_PTR(-ENOMEM) if allocation failed.
 * NOTE(review): excerpt -- some original lines are elided in this view.
 */
1860 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1862 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1863 struct net_device *loopback_dev = net->loopback_dev;
1864 struct dst_entry *new = NULL;
1866 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1867 DST_OBSOLETE_NONE, 0);
1873 new->input = dst_discard;
1874 new->output = dst_discard_out;
1876 dst_copy_metrics(new, &ort->dst);
1878 rt->rt6i_idev = in6_dev_get(loopback_dev);
1879 rt->rt6i_gateway = ort->rt6i_gateway;
/* RTF_PCPU is dropped: the clone is not a per-cpu route. */
1880 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1881 rt->rt6i_metric = 0;
1883 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1884 #ifdef CONFIG_IPV6_SUBTREES
1885 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1889 dst_release(dst_orig);
1890 return new ? new : ERR_PTR(-ENOMEM);
1894 * Destination cache support functions
/* Re-point this route's metrics at its parent's (dst.from) metrics block
 * if they have diverged.
 */
1897 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1900 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1901 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
/* Validate a fib-owned route against the caller's cookie and expiry.
 * NOTE(review): excerpt -- return statements are elided in this view.
 */
1904 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1908 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1911 if (rt6_check_expired(rt))
/* Validate a clone by checking it is unexpired and its parent
 * (dst.from) still passes rt6_check() for the same cookie.
 * NOTE(review): excerpt -- return statements are elided in this view.
 */
1917 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1919 if (!__rt6_check_expired(rt) &&
1920 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1921 rt6_check((struct rt6_info *)(rt->dst.from), cookie))
/* dst_ops->check hook: revalidate a cached dst.  Per-cpu routes and
 * uncached clones are checked through their parent; everything else is
 * checked directly.
 */
1927 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1929 struct rt6_info *rt;
1931 rt = (struct rt6_info *) dst;
1933 /* All IPV6 dsts are created with ->obsolete set to the value
1934 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1935 * into this function always.
1938 rt6_dst_from_metrics_check(rt);
1940 if (rt->rt6i_flags & RTF_PCPU ||
1941 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1942 return rt6_dst_from_check(rt, cookie);
1944 return rt6_check(rt, cookie);
/* dst_ops->negative_advice hook: drop an expired cached route.
 * NOTE(review): excerpt -- the removal/return logic is elided in this view.
 */
1947 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1949 struct rt6_info *rt = (struct rt6_info *) dst;
1952 if (rt->rt6i_flags & RTF_CACHE) {
1953 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure hook: report ICMPv6 address-unreachable to the
 * sender and invalidate the route the skb was using.
 * NOTE(review): excerpt -- some original lines are elided in this view.
 */
1965 static void ip6_link_failure(struct sk_buff *skb)
1967 struct rt6_info *rt;
1969 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1971 rt = (struct rt6_info *) skb_dst(skb);
1973 if (rt->rt6i_flags & RTF_CACHE) {
1974 if (dst_hold_safe(&rt->dst))
1977 struct fib6_node *fn;
1980 fn = rcu_dereference(rt->rt6i_node);
1981 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
/* Record a new path MTU on @rt, mark it modified, and arm the PMTU
 * expiry timer from the per-netns ip6_rt_mtu_expires sysctl.
 */
1988 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1990 struct net *net = dev_net(rt->dst.dev);
1992 rt->rt6i_flags |= RTF_MODIFIED;
1993 rt->rt6i_pmtu = mtu;
1994 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* True when a PMTU update on @rt should be stored in a separate cached
 * clone: @rt is not itself a cache entry but is either per-cpu or still
 * linked into the fib tree.
 */
1997 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1999 return !(rt->rt6i_flags & RTF_CACHE) &&
2000 (rt->rt6i_flags & RTF_PCPU ||
2001 rcu_access_pointer(rt->rt6i_node));
/* Apply a PMTU update to @dst.  Addresses come from @iph when present,
 * else from @sk.  Updates the route in place when allowed, refreshing the
 * exception stamp for cache entries; otherwise allocates a cached clone,
 * applies the MTU to it, and inserts it into the exception table.
 * Ignores locked-MTU routes, RTF_LOCAL routes, and MTUs >= current.
 * NOTE(review): excerpt -- some original lines are elided in this view.
 */
2004 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2005 const struct ipv6hdr *iph, u32 mtu)
2007 const struct in6_addr *daddr, *saddr;
2008 struct rt6_info *rt6 = (struct rt6_info *)dst;
2010 if (rt6->rt6i_flags & RTF_LOCAL)
2013 if (dst_metric_locked(dst, RTAX_MTU))
2017 daddr = &iph->daddr;
2018 saddr = &iph->saddr;
2020 daddr = &sk->sk_v6_daddr;
2021 saddr = &inet6_sk(sk)->saddr;
2026 dst_confirm_neigh(dst, daddr);
/* Never accept a PMTU below the IPv6 minimum of 1280. */
2027 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2028 if (mtu >= dst_mtu(dst))
2031 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2032 rt6_do_update_pmtu(rt6, mtu);
2033 /* update rt6_ex->stamp for cache */
2034 if (rt6->rt6i_flags & RTF_CACHE)
2035 rt6_update_exception_stamp_rt(rt6)
2037 struct rt6_info *nrt6;
2039 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2041 rt6_do_update_pmtu(nrt6, mtu);
2042 if (rt6_insert_exception(nrt6, rt6))
2043 dst_release_immediate(&nrt6->dst);
/* dst_ops->update_pmtu hook: forward to __ip6_rt_update_pmtu, passing
 * the skb's IPv6 header when a packet is available.
 */
2048 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2049 struct sk_buff *skb, u32 mtu)
2051 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* Update the PMTU for the flow described by the packet in @skb (used
 * when handling an ICMPv6 packet-too-big aimed at a non-socket flow):
 * rebuild the flowi6, look up the route, and apply the new MTU.
 */
2054 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2055 int oif, u32 mark, kuid_t uid)
2057 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2058 struct dst_entry *dst;
2061 memset(&fl6, 0, sizeof(fl6));
2062 fl6.flowi6_oif = oif;
2063 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2064 fl6.daddr = iph->daddr;
2065 fl6.saddr = iph->saddr;
2066 fl6.flowlabel = ip6_flowinfo(iph);
2067 fl6.flowi6_uid = uid;
2069 dst = ip6_route_output(net, NULL, &fl6);
2071 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2074 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket variant of ip6_update_pmtu: apply the new MTU using the
 * socket's bound device/mark/uid, then refresh the socket's cached dst
 * if it has become stale.
 * NOTE(review): excerpt -- some original lines are elided in this view.
 */
2076 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2078 struct dst_entry *dst;
2080 ip6_update_pmtu(skb, sock_net(sk), mtu,
2081 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2083 dst = __sk_dst_get(sk);
2084 if (!dst || !dst->obsolete ||
2085 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2089 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2090 ip6_datagram_dst_update(sk, false);
2093 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2095 /* Handle redirects */
/* flowi6 extended with the redirecting router's address, so the redirect
 * lookup can verify the redirect came from the current next hop.
 */
2096 struct ip6rd_flowi {
2098 struct in6_addr gateway;
/* Redirect-specific route lookup: find the route currently used for the
 * destination and accept the redirect only if it was sent by that
 * route's next hop (RFC 4861), consulting the exception table because a
 * cached clone's gateway may differ from its parent's.
 * NOTE(review): excerpt -- loop-exit labels and some branches are elided.
 */
2101 static struct rt6_info *__ip6_route_redirect(struct net *net,
2102 struct fib6_table *table,
2106 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2107 struct rt6_info *rt, *rt_cache;
2108 struct fib6_node *fn;
2110 /* Get the "current" route for this destination and
2111 * check if the redirect has come from appropriate router.
2113 * RFC 4861 specifies that redirects should only be
2114 * accepted if they come from the nexthop to the target.
2115 * Due to the way the routes are chosen, this notion
2116 * is a bit fuzzy and one might need to check all possible
2120 read_lock_bh(&table->tb6_lock);
2121 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2123 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2124 if (rt6_check_expired(rt))
2128 if (!(rt->rt6i_flags & RTF_GATEWAY))
2130 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2132 /* rt_cache's gateway might be different from its 'parent'
2133 * in the case of an ip redirect.
2134 * So we keep searching in the exception table if the gateway
2137 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2138 rt_cache = rt6_find_cached_rt(rt,
2142 ipv6_addr_equal(&rdfl->gateway,
2143 &rt_cache->rt6i_gateway)) {
2153 rt = net->ipv6.ip6_null_entry;
2154 else if (rt->dst.error) {
2155 rt = net->ipv6.ip6_null_entry;
2159 if (rt == net->ipv6.ip6_null_entry) {
2160 fn = fib6_backtrack(fn, &fl6->saddr);
2166 ip6_hold_safe(net, &rt, true);
2168 read_unlock_bh(&table->tb6_lock);
2170 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
/* Wrap a flowi6 plus the redirecting gateway into an ip6rd_flowi and run
 * the redirect lookup through the fib rules engine.
 */
2174 static struct dst_entry *ip6_route_redirect(struct net *net,
2175 const struct flowi6 *fl6,
2176 const struct in6_addr *gateway)
2178 int flags = RT6_LOOKUP_F_HAS_SADDR;
2179 struct ip6rd_flowi rdfl;
2182 rdfl.gateway = *gateway;
2184 return fib6_rule_lookup(net, &rdfl.fl6,
2185 flags, __ip6_route_redirect);
/* Process an ICMPv6 redirect for the flow in @skb: rebuild the flowi6
 * from the embedded IPv6 header and apply the redirect, treating the
 * outer source address as the redirecting router.
 */
2188 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2191 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2192 struct dst_entry *dst;
2195 memset(&fl6, 0, sizeof(fl6));
2196 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2197 fl6.flowi6_oif = oif;
2198 fl6.flowi6_mark = mark;
2199 fl6.daddr = iph->daddr;
2200 fl6.saddr = iph->saddr;
2201 fl6.flowlabel = ip6_flowinfo(iph);
2202 fl6.flowi6_uid = uid;
2204 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2205 rt6_do_redirect(dst, NULL, skb);
2208 EXPORT_SYMBOL_GPL(ip6_redirect);
/* Variant of ip6_redirect used when the redirect carries no embedded
 * packet header: build the flow from the rd_msg's destination and the
 * outer header's addresses instead.
 */
2210 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2213 const struct ipv6hdr *iph = ipv6_hdr(skb);
2214 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2215 struct dst_entry *dst;
2218 memset(&fl6, 0, sizeof(fl6));
2219 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2220 fl6.flowi6_oif = oif;
2221 fl6.flowi6_mark = mark;
2222 fl6.daddr = msg->dest;
2223 fl6.saddr = iph->daddr;
2224 fl6.flowi6_uid = sock_net_uid(net, NULL);
2226 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2227 rt6_do_redirect(dst, NULL, skb);
/* Socket convenience wrapper: apply a redirect using the socket's netns,
 * bound device, and mark.
 */
2231 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2233 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2236 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops->default_advmss hook: advertised MSS derived from the path
 * MTU minus IPv6 + TCP headers, clamped below by the ip6_rt_min_advmss
 * sysctl and above by the jumbogram sentinel described below.
 */
2238 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2240 struct net_device *dev = dst->dev;
2241 unsigned int mtu = dst_mtu(dst);
2242 struct net *net = dev_net(dev);
2244 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2246 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2247 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2250 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2251 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2252 * IPV6_MAXPLEN is also valid and means: "any MSS,
2253 * rely only on pmtu discovery"
2255 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops->mtu hook: prefer the cached PMTU, fall back to the raw MTU
 * metric, then the device's IPv6 MTU; cap at IP6_MAX_MTU and subtract
 * any lightweight-tunnel encapsulation headroom.
 * NOTE(review): excerpt -- fallback branch conditions are elided here.
 */
2260 static unsigned int ip6_mtu(const struct dst_entry *dst)
2262 const struct rt6_info *rt = (const struct rt6_info *)dst;
2263 unsigned int mtu = rt->rt6i_pmtu;
2264 struct inet6_dev *idev;
2269 mtu = dst_metric_raw(dst, RTAX_MTU);
2276 idev = __in6_dev_get(dst->dev);
2278 mtu = idev->cnf.mtu6;
2282 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2284 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
/* Allocate a standalone host route for sending an ICMPv6 packet toward
 * fl6->daddr on @dev.  The route is placed on the uncached list so
 * device teardown can release it, then passed through xfrm_lookup.
 * Returns ERR_PTR on failure.
 * NOTE(review): excerpt -- error-path labels are elided in this view.
 */
2287 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2290 struct dst_entry *dst;
2291 struct rt6_info *rt;
2292 struct inet6_dev *idev = in6_dev_get(dev);
2293 struct net *net = dev_net(dev);
2295 if (unlikely(!idev))
2296 return ERR_PTR(-ENODEV);
2298 rt = ip6_dst_alloc(net, dev, 0);
2299 if (unlikely(!rt)) {
2301 dst = ERR_PTR(-ENOMEM);
2305 rt->dst.flags |= DST_HOST;
2306 rt->dst.output = ip6_output;
2307 rt->rt6i_gateway = fl6->daddr;
2308 rt->rt6i_dst.addr = fl6->daddr;
2309 rt->rt6i_dst.plen = 128;
2310 rt->rt6i_idev = idev;
2311 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2313 /* Add this dst into uncached_list so that rt6_ifdown() can
2314 * do proper release of the net_device
2316 rt6_uncached_list_add(rt);
2318 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* dst_ops->gc hook: run fib6 garbage collection when the entry count
 * exceeds ip6_rt_max_size or the minimum GC interval has elapsed.  The
 * adaptive ip6_rt_gc_expire value shortens aggressively under pressure
 * and decays by the elasticity sysctl otherwise.  Returns non-zero when
 * the table is still over the size limit (allocation should fail).
 * NOTE(review): excerpt -- some original lines are elided in this view.
 */
2324 static int ip6_dst_gc(struct dst_ops *ops)
2326 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2327 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2328 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2329 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2330 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2331 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2334 entries = dst_entries_get_fast(ops);
2335 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2336 entries <= rt_max_size)
2339 net->ipv6.ip6_rt_gc_expire++;
2340 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2341 entries = dst_entries_get_slow(ops);
2342 if (entries < ops->gc_thresh)
2343 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2345 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2346 return entries > rt_max_size;
/* Convert the netlink RTAX_* metric attributes in @cfg into the mx array
 * of @mxc, validating each type (range of HOPLIMIT, mask of FEATURES,
 * congestion-algorithm name for CC_ALGO) and setting the ECN_CA feature
 * bit when the chosen CA requires it.
 * NOTE(review): excerpt -- error-handling paths are elided in this view.
 */
2349 static int ip6_convert_metrics(struct mx6_config *mxc,
2350 const struct fib6_config *cfg)
2352 bool ecn_ca = false;
2360 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2364 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2365 int type = nla_type(nla);
2370 if (unlikely(type > RTAX_MAX))
2373 if (type == RTAX_CC_ALGO) {
2374 char tmp[TCP_CA_NAME_MAX];
2376 nla_strlcpy(tmp, nla, sizeof(tmp));
2377 val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
2378 if (val == TCP_CA_UNSPEC)
2381 val = nla_get_u32(nla);
2383 if (type == RTAX_HOPLIMIT && val > 255)
2385 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2389 __set_bit(type - 1, mxc->mx_valid);
2393 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2394 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
/* Resolve a configured next-hop gateway within the route's own table
 * (used when adding a route with an explicit fc_table).  Falls back to a
 * full lookup if the table-scoped search returns the null entry.
 * NOTE(review): excerpt -- some original lines are elided in this view.
 */
2404 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2405 struct fib6_config *cfg,
2406 const struct in6_addr *gw_addr)
2408 struct flowi6 fl6 = {
2409 .flowi6_oif = cfg->fc_ifindex,
2411 .saddr = cfg->fc_prefsrc,
2413 struct fib6_table *table;
2414 struct rt6_info *rt;
2415 int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2417 table = fib6_get_table(net, cfg->fc_table);
2421 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2422 flags |= RT6_LOOKUP_F_HAS_SADDR;
2424 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2426 /* if table lookup failed, fall back to full lookup */
2427 if (rt == net->ipv6.ip6_null_entry) {
/* Build (but do not insert) an rt6_info from a netlink fib6_config:
 * validates prefix/source lengths and flags, resolves the egress device
 * and table, sets expiry/protocol/input/output handlers (including
 * lwtunnel redirection and reject-route promotion), validates and
 * resolves the gateway, and fills in prefsrc and metric.  Returns the
 * new route or ERR_PTR(err); on error the partially built dst is
 * released immediately.
 * NOTE(review): excerpt -- many error gotos, braces and cleanup lines
 * are elided in this view; do not infer full control flow from it.
 */
2435 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2436 struct netlink_ext_ack *extack)
2438 struct net *net = cfg->fc_nlinfo.nl_net;
2439 struct rt6_info *rt = NULL;
2440 struct net_device *dev = NULL;
2441 struct inet6_dev *idev = NULL;
2442 struct fib6_table *table;
2446 /* RTF_PCPU is an internal flag; can not be set by userspace */
2447 if (cfg->fc_flags & RTF_PCPU) {
2448 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2452 if (cfg->fc_dst_len > 128) {
2453 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2456 if (cfg->fc_src_len > 128) {
2457 NL_SET_ERR_MSG(extack, "Invalid source address length");
2460 #ifndef CONFIG_IPV6_SUBTREES
2461 if (cfg->fc_src_len) {
2462 NL_SET_ERR_MSG(extack,
2463 "Specifying source address requires IPV6_SUBTREES to be enabled");
2467 if (cfg->fc_ifindex) {
2469 dev = dev_get_by_index(net, cfg->fc_ifindex);
2472 idev = in6_dev_get(dev);
2477 if (cfg->fc_metric == 0)
2478 cfg->fc_metric = IP6_RT_PRIO_USER;
2481 if (cfg->fc_nlinfo.nlh &&
2482 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2483 table = fib6_get_table(net, cfg->fc_table);
2485 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2486 table = fib6_new_table(net, cfg->fc_table);
2489 table = fib6_new_table(net, cfg->fc_table);
2495 rt = ip6_dst_alloc(net, NULL,
2496 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2503 if (cfg->fc_flags & RTF_EXPIRES)
2504 rt6_set_expires(rt, jiffies +
2505 clock_t_to_jiffies(cfg->fc_expires));
2507 rt6_clean_expires(rt);
2509 if (cfg->fc_protocol == RTPROT_UNSPEC)
2510 cfg->fc_protocol = RTPROT_BOOT;
2511 rt->rt6i_protocol = cfg->fc_protocol;
2513 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Pick the input handler by destination class: multicast, local, forward. */
2515 if (addr_type & IPV6_ADDR_MULTICAST)
2516 rt->dst.input = ip6_mc_input;
2517 else if (cfg->fc_flags & RTF_LOCAL)
2518 rt->dst.input = ip6_input;
2520 rt->dst.input = ip6_forward;
2522 rt->dst.output = ip6_output;
2524 if (cfg->fc_encap) {
2525 struct lwtunnel_state *lwtstate;
2527 err = lwtunnel_build_state(cfg->fc_encap_type,
2528 cfg->fc_encap, AF_INET6, cfg,
2532 rt->dst.lwtstate = lwtstate_get(lwtstate);
2533 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2534 rt->dst.lwtstate->orig_output = rt->dst.output;
2535 rt->dst.output = lwtunnel_output;
2537 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2538 rt->dst.lwtstate->orig_input = rt->dst.input;
2539 rt->dst.input = lwtunnel_input;
2543 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2544 rt->rt6i_dst.plen = cfg->fc_dst_len;
2545 if (rt->rt6i_dst.plen == 128)
2546 rt->dst.flags |= DST_HOST;
2548 #ifdef CONFIG_IPV6_SUBTREES
2549 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2550 rt->rt6i_src.plen = cfg->fc_src_len;
2553 rt->rt6i_metric = cfg->fc_metric;
2555 /* We cannot add true routes via loopback here,
2556 they would result in kernel looping; promote them to reject routes
2558 if ((cfg->fc_flags & RTF_REJECT) ||
2559 (dev && (dev->flags & IFF_LOOPBACK) &&
2560 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2561 !(cfg->fc_flags & RTF_LOCAL))) {
2562 /* hold loopback dev/idev if we haven't done so. */
2563 if (dev != net->loopback_dev) {
2568 dev = net->loopback_dev;
2570 idev = in6_dev_get(dev);
2576 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2577 switch (cfg->fc_type) {
2579 rt->dst.error = -EINVAL;
2580 rt->dst.output = dst_discard_out;
2581 rt->dst.input = dst_discard;
2584 rt->dst.error = -EACCES;
2585 rt->dst.output = ip6_pkt_prohibit_out;
2586 rt->dst.input = ip6_pkt_prohibit;
2589 case RTN_UNREACHABLE:
2591 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2592 : (cfg->fc_type == RTN_UNREACHABLE)
2593 ? -EHOSTUNREACH : -ENETUNREACH;
2594 rt->dst.output = ip6_pkt_discard_out;
2595 rt->dst.input = ip6_pkt_discard;
2601 if (cfg->fc_flags & RTF_GATEWAY) {
2602 const struct in6_addr *gw_addr;
2605 gw_addr = &cfg->fc_gateway;
2606 gwa_type = ipv6_addr_type(gw_addr);
2608 /* if gw_addr is local we will fail to detect this in case
2609 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2610 * will return already-added prefix route via interface that
2611 * prefix route was assigned to, which might be non-loopback.
2614 if (ipv6_chk_addr_and_flags(net, gw_addr,
2615 gwa_type & IPV6_ADDR_LINKLOCAL ?
2616 dev : NULL, 0, 0)) {
2617 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2620 rt->rt6i_gateway = *gw_addr;
2622 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2623 struct rt6_info *grt = NULL;
2625 /* IPv6 strictly inhibits using not link-local
2626 addresses as nexthop address.
2627 Otherwise, router will not able to send redirects.
2628 It is very good, but in some (rare!) circumstances
2629 (SIT, PtP, NBMA NOARP links) it is handy to allow
2630 some exceptions. --ANK
2631 We allow IPv4-mapped nexthops to support RFC4798-type
2634 if (!(gwa_type & (IPV6_ADDR_UNICAST |
2635 IPV6_ADDR_MAPPED))) {
2636 NL_SET_ERR_MSG(extack,
2637 "Invalid gateway address");
2641 if (cfg->fc_table) {
2642 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2645 if (grt->rt6i_flags & RTF_GATEWAY ||
2646 (dev && dev != grt->dst.dev)) {
2654 grt = rt6_lookup(net, gw_addr, NULL,
2655 cfg->fc_ifindex, 1);
2657 err = -EHOSTUNREACH;
2661 if (dev != grt->dst.dev) {
2667 idev = grt->rt6i_idev;
2669 in6_dev_hold(grt->rt6i_idev);
2671 if (!(grt->rt6i_flags & RTF_GATEWAY))
2680 NL_SET_ERR_MSG(extack, "Egress device not specified");
2682 } else if (dev->flags & IFF_LOOPBACK) {
2683 NL_SET_ERR_MSG(extack,
2684 "Egress device can not be loopback device for this route");
2693 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2694 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2695 NL_SET_ERR_MSG(extack, "Invalid source address");
2699 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2700 rt->rt6i_prefsrc.plen = 128;
2702 rt->rt6i_prefsrc.plen = 0;
2704 rt->rt6i_flags = cfg->fc_flags;
2708 rt->rt6i_idev = idev;
2709 rt->rt6i_table = table;
2711 cfg->fc_nlinfo.nl_net = dev_net(dev);
2720 dst_release_immediate(&rt->dst);
2722 return ERR_PTR(err);
/* Create a route from @cfg, convert its metrics, and insert it into the
 * fib tree; on failure the metrics buffer and the route are released.
 * NOTE(review): excerpt -- error gotos and returns are elided here.
 */
2725 int ip6_route_add(struct fib6_config *cfg,
2726 struct netlink_ext_ack *extack)
2728 struct mx6_config mxc = { .mx = NULL, };
2729 struct rt6_info *rt;
2732 rt = ip6_route_info_create(cfg, extack);
2739 err = ip6_convert_metrics(&mxc, cfg);
2743 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2750 dst_release_immediate(&rt->dst);
/* Delete one route from its table under tb6_lock.  Rejects deletion of
 * the null entry.
 * NOTE(review): excerpt -- the null-entry error path is elided here.
 */
2755 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2758 struct fib6_table *table;
2759 struct net *net = dev_net(rt->dst.dev);
2761 if (rt == net->ipv6.ip6_null_entry) {
2766 table = rt->rt6i_table;
2767 write_lock_bh(&table->tb6_lock);
2768 err = fib6_del(rt, info);
2769 write_unlock_bh(&table->tb6_lock);
/* Public wrapper around __ip6_del_rt with default netlink info taken
 * from the route's own netns.
 */
2776 int ip6_del_rt(struct rt6_info *rt)
2778 struct nl_info info = {
2779 .nl_net = dev_net(rt->dst.dev),
2781 return __ip6_del_rt(rt, &info);
/* Delete a multipath route together with all of its siblings, emitting a
 * single RTM_DELROUTE notification that covers every hop (per-route
 * notifications are suppressed via info->skip_notify).
 * NOTE(review): excerpt -- loop/exit labels and some cleanup lines are
 * elided in this view.
 */
2784 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2786 struct nl_info *info = &cfg->fc_nlinfo;
2787 struct net *net = info->nl_net;
2788 struct sk_buff *skb = NULL;
2789 struct fib6_table *table;
2792 if (rt == net->ipv6.ip6_null_entry)
2794 table = rt->rt6i_table;
2795 write_lock_bh(&table->tb6_lock);
2797 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2798 struct rt6_info *sibling, *next_sibling;
2800 /* prefer to send a single notification with all hops */
2801 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2803 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2805 if (rt6_fill_node(net, skb, rt,
2806 NULL, NULL, 0, RTM_DELROUTE,
2807 info->portid, seq, 0) < 0) {
2811 info->skip_notify = 1;
2814 list_for_each_entry_safe(sibling, next_sibling,
2817 err = fib6_del(sibling, info);
2823 err = fib6_del(rt, info);
2825 write_unlock_bh(&table->tb6_lock);
2830 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2831 info->nlh, gfp_any());
/* Delete the route matching @cfg: locate the fib6 node, walk its leaf
 * chain matching device, gateway, metric and protocol (consulting the
 * exception table for RTF_CACHE deletions), then delete either the
 * single hop (RTF_GATEWAY given) or all siblings.
 * NOTE(review): excerpt -- several branch bodies are elided in this view.
 */
2836 static int ip6_route_del(struct fib6_config *cfg,
2837 struct netlink_ext_ack *extack)
2839 struct rt6_info *rt, *rt_cache;
2840 struct fib6_table *table;
2841 struct fib6_node *fn;
2844 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2846 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2850 read_lock_bh(&table->tb6_lock);
2852 fn = fib6_locate(&table->tb6_root,
2853 &cfg->fc_dst, cfg->fc_dst_len,
2854 &cfg->fc_src, cfg->fc_src_len,
2855 !(cfg->fc_flags & RTF_CACHE));
2858 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2859 if (cfg->fc_flags & RTF_CACHE) {
2860 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
2866 if (cfg->fc_ifindex &&
2868 rt->dst.dev->ifindex != cfg->fc_ifindex))
2870 if (cfg->fc_flags & RTF_GATEWAY &&
2871 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2873 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2875 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2877 if (!dst_hold_safe(&rt->dst))
2879 read_unlock_bh(&table->tb6_lock);
2881 /* if gateway was specified only delete the one hop */
2882 if (cfg->fc_flags & RTF_GATEWAY)
2883 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2885 return __ip6_del_rt_siblings(rt, cfg);
2888 read_unlock_bh(&table->tb6_lock);
/* Handle a received ICMPv6 Redirect for @dst.
 *
 * Validates the redirect per RFC 4861 (length, non-multicast
 * destination, link-local unicast target unless on-link), updates the
 * neighbour cache for the new first hop, and installs a cached clone
 * (exception entry) steering @msg->dest via the redirect target.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	int optlen, on_link;
	/* option length = bytes after the fixed rd_msg header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
	msg = (struct rd_msg *)icmp6_hdr(skb);
	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
	/* dest == target means the destination itself is on-link */
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
	in6_dev = __in6_dev_get(skb->dev);
	/* routers and redirect-disabled interfaces ignore redirects */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
	/* RFC2461 8.1:
	 * The IP source address of the Redirect MUST be the same as the current
	 * first-hop router for the specified ICMP Destination Address.
	 */
	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	/*
	 * We have finally decided to accept it.
	 */
	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);
	/* clone the current route as a host-route exception for the dest */
	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
		nrt->rt6i_flags &= ~RTF_GATEWAY;
	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
	/* tell interested parties (e.g. offload drivers) about the switch */
	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
	neigh_release(neigh);
3012 * Misc support functions
/* Link clone @rt to its origin route @from: take a reference on the
 * origin's dst, record it in rt->dst.from and share its metrics.
 * @from must not itself be a clone (BUG_ON enforces that).
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
	BUG_ON(from->dst.from);
	/* lifetime now follows the parent, not an own expiry */
	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	/* read-only metrics shared with the parent route */
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
/* Initialize a freshly allocated route @rt as a copy of @ort:
 * duplicates handlers, addresses, flags and device state, and links
 * @rt to @ort via rt6_set_from() so metrics are shared.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	/* source-prefix key only exists with subtree support */
	rt->rt6i_src = ort->rt6i_src;
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3047 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA Route Information (RFC 4191) route for @prefix/@prefixlen
 * learnt via @gwaddr on @dev.  Returns the route with a reference held,
 * or NULL if not present.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
	/* L3 master device routes live in their own table */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;
	table = fib6_get_table(net, tb_id);
	read_lock_bh(&table->tb6_lock);
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->dst.dev->ifindex != ifindex)
		/* must be a gatewayed route learnt from Route Information */
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
		ip6_hold_safe(NULL, &rt, false);
	read_unlock_bh(&table->tb6_lock);
/* Install an RA Route Information (RFC 4191) route for @prefix via
 * @gwaddr on @dev with preference @pref, then return the inserted
 * route via rt6_get_route_info() (NULL if the add failed).
 */
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
	struct fib6_config cfg = {
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;
	/* We should treat it as a default route if prefix length is 0. */
		cfg.fc_flags |= RTF_DEFAULT;
	ip6_route_add(&cfg, NULL);
	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
/* Find the RA-learnt default route via gateway @addr on @dev.
 * Returns the route with a reference held, or NULL if none exists.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;
	table = fib6_get_table(dev_net(dev), tb_id);
	read_lock_bh(&table->tb6_lock);
	/* default routes hang off the table root (::/0) */
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
	ip6_hold_safe(NULL, &rt, false);
	read_unlock_bh(&table->tb6_lock);
/* Install an RA-learnt default route via @gwaddr on @dev with router
 * preference @pref, mark the table as holding a default router, and
 * return the route via rt6_get_dflt_router().
 */
struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
				     struct net_device *dev,
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = dev_net(dev),
	cfg.fc_gateway = *gwaddr;
	if (!ip6_route_add(&cfg, NULL)) {
		struct fib6_table *table;
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
			/* lets rt6_purge_dflt_routers() skip untouched tables */
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	return rt6_get_dflt_router(gwaddr, dev);
/* Delete every addrconf/default route in @table, except on interfaces
 * with accept_ra == 2 (accept RA even when forwarding).  The read lock
 * is dropped for each deletion, so the scan restarts afterwards.
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
	struct rt6_info *rt;
	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* hold before dropping the lock so rt stays valid */
			if (dst_hold_safe(&rt->dst)) {
				read_unlock_bh(&table->tb6_lock);
				read_unlock_bh(&table->tb6_lock);
	read_unlock_bh(&table->tb6_lock);
	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
/* Purge RA-learnt default routers from every FIB table in @net that is
 * flagged as containing one (RT6_TABLE_HAS_DFLT_ROUTER).
 */
void rt6_purge_dflt_routers(struct net *net)
	struct fib6_table *table;
	struct hlist_head *head;
	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(table);
/* Translate a legacy ioctl struct in6_rtmsg into a fib6_config so the
 * SIOCADDRT/SIOCDELRT paths can share the netlink add/del code.
 */
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
	memset(cfg, 0, sizeof(*cfg));
	/* route via an L3 master device goes into that device's table */
	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
	cfg->fc_metric = rtmsg->rtmsg_metric;
	cfg->fc_expires = rtmsg->rtmsg_info;
	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
	cfg->fc_src_len = rtmsg->rtmsg_src_len;
	cfg->fc_flags = rtmsg->rtmsg_flags;
	cfg->fc_nlinfo.nl_net = net;
	cfg->fc_dst = rtmsg->rtmsg_dst;
	cfg->fc_src = rtmsg->rtmsg_src;
	cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* Legacy ioctl entry point for SIOCADDRT/SIOCDELRT route management.
 * Requires CAP_NET_ADMIN; copies the userspace in6_rtmsg, converts it
 * to a fib6_config and dispatches to ip6_route_add()/ip6_route_del().
 * Returns 0 or a negative errno.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
			err = ip6_route_add(&cfg, NULL);
			err = ip6_route_del(&cfg, NULL);
/*
 *	Drop the packet on the floor
 */

/* Drop @skb, bump the matching no-route SNMP counter and send an
 * ICMPv6 Destination Unreachable with @code back to the sender.
 * Unspecified-destination input packets count as address errors and
 * get no ICMP reply.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
		/* fallthrough */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst input handler for blackhole/unreachable routes (inbound). */
static int ip6_pkt_discard(struct sk_buff *skb)
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst output handler for blackhole/unreachable routes (outbound). */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* dst input handler for prohibit routes: drop with admin-prohibited ICMP. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst output handler for prohibit routes: drop with admin-prohibited ICMP. */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

/* Build a host route (/128) for a local or anycast address owned by
 * @idev, placed in the (l3mdev-aware) local table.  Returns the new
 * rt6_info or ERR_PTR(-ENOMEM).
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
	struct net *net = dev_net(idev->dev);
	struct net_device *dev = idev->dev;
	struct rt6_info *rt;
	/* DST_NOCOUNT: local routes are not charged to the dst gc budget */
	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
		return ERR_PTR(-ENOMEM);
	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;
	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
		rt->rt6i_flags |= RTF_ANYCAST;
		rt->rt6i_flags |= RTF_LOCAL;
	/* host route: gateway == destination == the local address */
	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);
/* remove deleted ip from prefsrc entries */
/* Walker argument bundle for fib6_remove_prefsrc(). */
struct arg_dev_net_ip {
	struct net_device *dev;		/* restrict to this device; NULL = all */
	struct in6_addr *addr;		/* the preferred source being removed */
/* fib6_clean_all() callback: clear the preferred-source setting on any
 * route that still references the address being deleted, including its
 * cached clones in the exception table.
 */
static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
	if (((void *)rt->dst.dev == dev || !dev) &&
	    rt != net->ipv6.ip6_null_entry &&
	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->rt6i_prefsrc.plen = 0;
		/* need to update cache as well */
		rt6_exceptions_remove_prefsrc(rt);
		spin_unlock_bh(&rt6_exception_lock);
/* Called when address @ifp is removed: walk all FIB tables and strip it
 * from every route's preferred-source field.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
/* flag combination identifying an RA-learnt router route */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
	struct in6_addr *gateway = (struct in6_addr *)arg;
	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);
/* Walk all FIB tables and drop router routes via @gateway, which has
 * just become an on-link host (see fib6_clean_tohost()).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
	fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Walker argument bundle for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device going down; NULL = all devices */
/* called with write lock held for table with rt */
/* fib6_clean_all() callback used at interface shutdown: select routes
 * on @dev for deletion.  Multipath routes survive a mere link-down if
 * ignore_routes_with_linkdown is unset, but are always purged when the
 * device is unregistering.
 */
static int fib6_ifdown(struct rt6_info *rt, void *arg)
	const struct arg_dev_net *adn = arg;
	const struct net_device *dev = adn->dev;
	if ((rt->dst.dev == dev || !dev) &&
	    rt != adn->net->ipv6.ip6_null_entry &&
	    (rt->rt6i_nsiblings == 0 ||
	     (dev && netdev_unregistering(dev)) ||
	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
/* Remove all routes bound to @dev (or every device when @dev is NULL)
 * and flush matching entries from the per-cpu uncached list.
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
	struct arg_dev_net adn = {
	fib6_clean_all(net, fib6_ifdown, &adn);
	rt6_uncached_list_flush_dev(net, dev);
/* Walker argument bundle for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
/* fib6_clean_all() callback: propagate a device MTU change into route
 * PMTU metrics (and cached-route exceptions) for routes on that device.
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;
	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/
	idev = __in6_dev_get(arg->dev);
	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)

	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	 */
	if (rt->dst.dev == arg->dev &&
	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		spin_lock_bh(&rt6_exception_lock);
		if (dst_mtu(&rt->dst) >= arg->mtu ||
		    (dst_mtu(&rt->dst) < arg->mtu &&
		     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		/* cached clones carry their own PMTU — update them too */
		rt6_exceptions_update_pmtu(rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
/* Entry point for device MTU changes: update PMTU metrics on every
 * route using @dev across all tables of the device's netns.
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
	struct rt6_mtu_change_arg arg = {
	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for IPv6 RTM_* route messages. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into @cfg.
 *
 * Validates attributes against rtm_ipv6_policy, maps rtm_type to route
 * flags (reject types, local, cloned) and copies every recognised
 * attribute.  Returns 0 on success or a negative errno; parse failures
 * are reported via @extack.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
	struct nlattr *tb[RTA_MAX+1];
	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));
	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;
	/* all reject-style route types share RTF_REJECT; dst.error picks
	 * the specific behavior later
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;
	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;
	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;
	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
		/* attribute may carry only prefix-len worth of bytes */
		int plen = (rtm->rtm_dst_len + 7) >> 3;
		if (nla_len(tb[RTA_DST]) < plen)
		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
		int plen = (rtm->rtm_src_len + 7) >> 3;
		if (nla_len(tb[RTA_SRC]) < plen)
		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		pref = nla_get_u8(tb[RTA_PREF]);
		/* unknown preference values fall back to medium (RFC 4191) */
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
		cfg->fc_encap = tb[RTA_ENCAP];
	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
	struct rt6_info *rt6_info;	/* route built for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop copy of the request */
	struct mx6_config mxc;		/* converted metrics for insertion */
	struct list_head next;		/* link in the rt6_nh_list */
/* Log every nexthop of a multipath replace that failed mid-way, so the
 * admin can audit which routes may now be inconsistent.
 */
static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
			nh->r_cfg.fc_ifindex);
/* Queue @rt (with its config @r_cfg) on @rt6_nh_list for a later batch
 * insert, skipping duplicates.  Converts the config metrics into the
 * per-entry mx6_config.  Returns 0, -EEXIST on duplicate, or -ENOMEM.
 */
static int ip6_route_info_append(struct list_head *rt6_nh_list,
				 struct rt6_info *rt, struct fib6_config *r_cfg)
	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if rt6_info already exists */
		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	err = ip6_convert_metrics(&nh->mxc, r_cfg);
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);
/* Send the single RTM_NEWROUTE notification for a multipath operation,
 * choosing the route whose dump covers the whole nexthop group.
 */
static void ip6_route_mpath_notify(struct rt6_info *rt,
				   struct rt6_info *rt_last,
				   struct nl_info *info,
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
		rt = list_first_entry(&rt_last->rt6i_siblings,
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
/* Add (or replace/append) a multipath route described by the
 * RTA_MULTIPATH payload in @cfg.
 *
 * Two phases: first every rtnexthop entry is parsed and a rt6_info is
 * built and queued; then the queued routes are inserted one by one with
 * per-route notifications suppressed, and a single notification
 * covering the whole group is sent at the end.  On mid-insert failure
 * the already-inserted routes are rolled back (deleted) unless this was
 * a replace, which cannot be undone.  Returns 0 or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);
	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;
	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;
	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* each nexthop starts from the shared config and overrides
		 * ifindex/gateway/encap from its own rtnexthop attributes
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
		attrlen = rtnh_attrlen(rtnh);
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
				r_cfg.fc_encap_type = nla_get_u16(nla);
		rt = ip6_route_info_create(&r_cfg, extack);
		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
			dst_release_immediate(&rt->dst);
		rtnh = rtnh_next(rtnh, &remaining);
	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;
		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
			/* a replace cannot be rolled back — just report it */
			ip6_print_replace_route_err(&rt6_nh_list);
		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 */
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		ip6_route_del(&nh->r_cfg, extack);
	/* free the queue: any rt6_info not yet consumed and all rt6_nh */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
			dst_release_immediate(&nh->rt6_info->dst);
		list_del(&nh->next);
/* Delete every nexthop listed in the RTA_MULTIPATH payload of @cfg,
 * one ip6_route_del() per rtnexthop entry.  Returns the last error
 * encountered (0 when all deletions succeed).
 */
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int err = 1, last_err = 0;
	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;
	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
		attrlen = rtnh_attrlen(rtnh);
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
		err = ip6_route_del(&r_cfg, extack);
		rtnh = rtnh_next(rtnh, &remaining);
/* rtnetlink handler for RTM_DELROUTE: parse the message and dispatch
 * to the multipath or single-route delete path.
 */
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
	struct fib6_config cfg;
	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
		return ip6_route_multipath_del(&cfg, extack);
		/* no nexthop selectors given: delete all matching nexthops */
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
/* rtnetlink handler for RTM_NEWROUTE: parse the message and dispatch
 * to the multipath or single-route add path.
 */
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
	struct fib6_config cfg;
	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
		return ip6_route_multipath_add(&cfg, extack);
		return ip6_route_add(&cfg, extack);
/* Worst-case netlink message size for dumping @rt via rt6_fill_node();
 * used to size the skb for notifications.  Multipath routes add one
 * rtnexthop (+gateway +encap) per sibling.
 */
static size_t rt6_nlmsg_size(struct rt6_info *rt)
	int nexthop_len = 0;
	if (rt->rt6i_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->dst.lwtstate);
		nexthop_len *= rt->rt6i_nsiblings;
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
/* Emit the nexthop attributes of @rt (gateway, oif, encap) into @skb
 * and accumulate RTNH_F_* state into *@flags.  @skip_oif suppresses
 * RTA_OIF for multipath entries, which carry the ifindex in the
 * rtnexthop header instead.  Returns 0 or -EMSGSIZE.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags, bool skip_oif)
	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
		*flags |= RTNH_F_LINKDOWN;
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;
	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
		goto nla_put_failure;
/* add multipath next hop */
/* Append one rtnexthop entry for @rt inside an open RTA_MULTIPATH nest;
 * its length is fixed up after the nested attributes are written.
 * Returns 0 or -EMSGSIZE.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
	struct rtnexthop *rtnh;
	unsigned int flags = 0;
	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
		goto nla_put_failure;
	rtnh->rtnh_hops = 0;
	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;
	rtnh->rtnh_flags = flags;
	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
/* Serialize route @rt into an rtnetlink message on @skb.
 *
 * Used by dumps, RTM_GETROUTE replies and notifications.  @dst/@src,
 * when non-NULL, come from a specific lookup and override the route's
 * own prefixes (reported as /128).  @iif marks an input-side query.
 * Returns 0 on success or -EMSGSIZE, cancelling the partial message.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
	u32 metrics[RTAX_MAX];
	struct nlmsghdr *nlh;
	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
		table = rt->rt6i_table->tb6_id;
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	if (rt->rt6i_flags & RTF_REJECT) {
		/* dst.error distinguishes the reject flavours */
		switch (rt->dst.error) {
			rtm->rtm_type = RTN_BLACKHOLE;
			rtm->rtm_type = RTN_PROHIBIT;
			rtm->rtm_type = RTN_THROW;
			rtm->rtm_type = RTN_UNREACHABLE;
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;
		/* lookup-specific destination overrides the prefix */
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#ifdef CONFIG_IPV6_MROUTE
	if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
		/* multicast routes are answered by the mroute engine */
		int err = ip6mr_get_route(net, skb, rtm, portid);
				goto nla_put_failure;
		if (nla_put_u32(skb, RTA_IIF, iif))
			goto nla_put_failure;
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
		/* a per-route PMTU (from a cached clone) overrides the metric */
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;
	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;
	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		mp = nla_nest_start(skb, RTA_MULTIPATH);
			goto nla_put_failure;
		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		nla_nest_end(skb, mp);
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;
	nlmsg_end(skb, nlh);
nla_put_failure:
	nlmsg_cancel(skb, nlh);
/* fib6 walker callback used by RTM_GETROUTE dumps: emit one route into
 * the dump skb via rt6_fill_node(), honouring the RTM_F_PREFIX filter.
 * Skips the null entry.  Returns rt6_fill_node()'s result.
 */
int rt6_dump_route(struct rt6_info *rt, void *p_arg)
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;
	if (rt == net->ipv6.ip6_null_entry)
	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
	return rt6_fill_node(net,
		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
/* rtnetlink handler for RTM_GETROUTE: perform a route lookup with the
 * parameters from the request (src/dst, iif/oif, mark, uid) and reply
 * with a unicast RTM_NEWROUTE message.  With RTM_F_FIB_MATCH the FIB
 * entry itself is reported instead of the resolved dst.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
		iif = nla_get_u32(tb[RTA_IIF]);
		oif = nla_get_u32(tb[RTA_OIF]);
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
		/* input-side query: simulate reception on device @iif */
		struct net_device *dev;
		dev = dev_get_by_index_rcu(net, iif);
		fl6.flowi6_iif = iif;
		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;
			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
			dst = ip6_route_lookup(net, &fl6, 0);
		/* output-side query */
		fl6.flowi6_oif = oif;
			dst = ip6_route_output(net, NULL, &fl6);
			dst = ip6_route_lookup(net, &fl6, 0);
	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	skb_dst_set(skb, &rt->dst);
		/* fibmatch: report the FIB entry, not the resolved route */
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/* Broadcast an rtnetlink notification for route @rt (message type
 * chosen by @event) to the RTNLGRP_IPV6_ROUTE multicast group. */
4314 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4315 unsigned int nlm_flags)
4317 struct sk_buff *skb;
4318 struct net *net = info->nl_net;
/* Echo the triggering request's sequence number when there is one;
 * kernel-originated events use seq 0. */
4323 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
/* gfp_any() picks GFP_ATOMIC vs GFP_KERNEL based on context. */
4325 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4329 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4330 event, info->portid, seq, nlm_flags);
4332 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4333 WARN_ON(err == -EMSGSIZE);
4337 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4338 info->nlh, gfp_any());
/* Failure path: record the error against the multicast group so
 * listeners learn they may have missed a notification. */
4342 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/* netdevice notifier: keep the per-netns special route entries
 * (null, and with multiple tables also prohibit/blackhole) pointing
 * at the namespace's loopback device and holding its inet6_dev. */
4345 static int ip6_route_dev_notify(struct notifier_block *this,
4346 unsigned long event, void *ptr)
4348 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4349 struct net *net = dev_net(dev);
/* Only the loopback device is of interest here. */
4351 if (!(dev->flags & IFF_LOOPBACK))
4354 if (event == NETDEV_REGISTER) {
/* Loopback registered: wire dev and take inet6_dev references. */
4355 net->ipv6.ip6_null_entry->dst.dev = dev;
4356 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4357 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4358 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4359 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4360 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4361 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4363 } else if (event == NETDEV_UNREGISTER &&
4364 dev->reg_state != NETREG_UNREGISTERED) {
4365 /* NETDEV_UNREGISTER could be fired for multiple times by
4366 * netdev_wait_allrefs(). Make sure we only call this once.
4368 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4369 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4370 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4371 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4382 #ifdef CONFIG_PROC_FS
/* File operations for /proc/net/ipv6_route (seq_file based; created in
 * ip6_route_net_init_late(), removed in ip6_route_net_exit_late()). */
4384 static const struct file_operations ipv6_route_proc_fops = {
4385 .owner = THIS_MODULE,
4386 .open = ipv6_route_open,
4388 .llseek = seq_lseek,
4389 .release = seq_release_net,
/* /proc/net/rt6_stats show callback: emit one line of hex counters
 * describing the per-netns IPv6 FIB state. */
4392 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
/* seq->private carries the owning netns (single_open_net()). */
4394 struct net *net = (struct net *)seq->private;
4395 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4396 net->ipv6.rt6_stats->fib_nodes,
4397 net->ipv6.rt6_stats->fib_route_nodes,
4398 net->ipv6.rt6_stats->fib_rt_alloc,
4399 net->ipv6.rt6_stats->fib_rt_entries,
4400 net->ipv6.rt6_stats->fib_rt_cache,
/* Current dst entry count; the "slow" read sums per-cpu counters. */
4401 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4402 net->ipv6.rt6_stats->fib_discarded_routes,
/* open() for /proc/net/rt6_stats: single-record, per-netns seq file. */
4407 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4409 return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for /proc/net/rt6_stats (see rt6_stats_seq_show). */
4412 static const struct file_operations rt6_stats_seq_fops = {
4413 .owner = THIS_MODULE,
4414 .open = rt6_stats_seq_open,
4416 .llseek = seq_lseek,
4417 .release = single_release_net,
4419 #endif /* CONFIG_PROC_FS */
4421 #ifdef CONFIG_SYSCTL
/* sysctl handler for net.ipv6.route.flush: writing a delay value
 * triggers a garbage-collection pass over the IPv6 FIB. */
4424 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4425 void __user *buffer, size_t *lenp, loff_t *ppos)
/* ->extra1 carries the owning netns (set in ipv6_route_sysctl_init). */
4432 net = (struct net *)ctl->extra1;
/* Snapshot the delay before proc_dointvec() updates flush_delay. */
4433 delay = net->ipv6.sysctl.flush_delay;
4434 proc_dointvec(ctl, write, buffer, lenp, ppos);
/* delay <= 0: run GC immediately; delay > 0: force expiry after the
 * given delay (third argument enables forced expiry). */
4435 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/* Template for the per-netns net.ipv6.route.* sysctl table.  The .data
 * pointers reference init_net here and are re-targeted to each new
 * namespace's own fields in ipv6_route_sysctl_init(); that function
 * indexes this array by position, so entry order must not change. */
4439 struct ctl_table ipv6_route_table_template[] = {
4441 .procname = "flush",
4442 .data = &init_net.ipv6.sysctl.flush_delay,
4443 .maxlen = sizeof(int),
4445 .proc_handler = ipv6_sysctl_rtcache_flush
4448 .procname = "gc_thresh",
4449 .data = &ip6_dst_ops_template.gc_thresh,
4450 .maxlen = sizeof(int),
4452 .proc_handler = proc_dointvec,
4455 .procname = "max_size",
4456 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4457 .maxlen = sizeof(int),
4459 .proc_handler = proc_dointvec,
4462 .procname = "gc_min_interval",
4463 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4464 .maxlen = sizeof(int),
4466 .proc_handler = proc_dointvec_jiffies,
4469 .procname = "gc_timeout",
4470 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4471 .maxlen = sizeof(int),
4473 .proc_handler = proc_dointvec_jiffies,
4476 .procname = "gc_interval",
4477 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4478 .maxlen = sizeof(int),
4480 .proc_handler = proc_dointvec_jiffies,
4483 .procname = "gc_elasticity",
4484 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4485 .maxlen = sizeof(int),
4487 .proc_handler = proc_dointvec,
4490 .procname = "mtu_expires",
4491 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4492 .maxlen = sizeof(int),
4494 .proc_handler = proc_dointvec_jiffies,
4497 .procname = "min_adv_mss",
4498 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4499 .maxlen = sizeof(int),
4501 .proc_handler = proc_dointvec,
/* Same variable as gc_min_interval, exposed in milliseconds. */
4504 .procname = "gc_min_interval_ms",
4505 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4506 .maxlen = sizeof(int),
4508 .proc_handler = proc_dointvec_ms_jiffies,
/* Duplicate the route sysctl template for a new netns and point each
 * entry's .data at the namespace's own variables.  Indices must stay
 * in sync with ipv6_route_table_template[] ordering. */
4513 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4515 struct ctl_table *table;
4517 table = kmemdup(ipv6_route_table_template,
4518 sizeof(ipv6_route_table_template),
4522 table[0].data = &net->ipv6.sysctl.flush_delay;
/* extra1 lets ipv6_sysctl_rtcache_flush() recover the netns. */
4523 table[0].extra1 = net;
4524 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4525 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4526 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4527 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4528 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4529 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4530 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4531 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
/* gc_min_interval_ms shares its backing variable with entry 3. */
4532 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4534 /* Don't export sysctls to unprivileged users */
4535 if (net->user_ns != &init_user_ns)
4536 table[0].procname = NULL;
/* Per-netns init: clone dst ops and the special route entries
 * (null, and with multiple tables also prohibit/blackhole), then seed
 * the namespace's routing sysctl defaults.  Unwinds allocations via
 * goto labels on failure.
 * NOTE(review): several lines (labels, returns, braces) are missing
 * from this extract; code kept byte-identical. */
4543 static int __net_init ip6_route_net_init(struct net *net)
4547 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4548 sizeof(net->ipv6.ip6_dst_ops));
4550 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4551 goto out_ip6_dst_ops;
/* Null route: every netns gets its own copy of the template. */
4553 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4554 sizeof(*net->ipv6.ip6_null_entry),
4556 if (!net->ipv6.ip6_null_entry)
4557 goto out_ip6_dst_entries;
/* Each special dst is its own path and uses the netns dst ops. */
4558 net->ipv6.ip6_null_entry->dst.path =
4559 (struct dst_entry *)net->ipv6.ip6_null_entry;
4560 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4561 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4562 ip6_template_metrics, true);
4564 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4565 net->ipv6.fib6_has_custom_rules = false;
4566 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4567 sizeof(*net->ipv6.ip6_prohibit_entry),
4569 if (!net->ipv6.ip6_prohibit_entry)
4570 goto out_ip6_null_entry;
4571 net->ipv6.ip6_prohibit_entry->dst.path =
4572 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
4573 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4574 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4575 ip6_template_metrics, true);
4577 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4578 sizeof(*net->ipv6.ip6_blk_hole_entry),
4580 if (!net->ipv6.ip6_blk_hole_entry)
4581 goto out_ip6_prohibit_entry;
4582 net->ipv6.ip6_blk_hole_entry->dst.path =
4583 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
4584 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4585 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4586 ip6_template_metrics, true);
/* Default sysctl values for this namespace (jiffies-based timers). */
4589 net->ipv6.sysctl.flush_delay = 0;
4590 net->ipv6.sysctl.ip6_rt_max_size = 4096;
4591 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4592 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4593 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4594 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4595 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
/* IPV6_MIN_MTU minus TCP (20) and IPv6 (40) header overhead. */
4596 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4598 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Error unwinding: free in reverse order of allocation. */
4604 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4605 out_ip6_prohibit_entry:
4606 kfree(net->ipv6.ip6_prohibit_entry);
4608 kfree(net->ipv6.ip6_null_entry);
4610 out_ip6_dst_entries:
4611 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-netns teardown: free the special route entries allocated in
 * ip6_route_net_init() and destroy the dst entry counter. */
4616 static void __net_exit ip6_route_net_exit(struct net *net)
4618 kfree(net->ipv6.ip6_null_entry);
4619 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4620 kfree(net->ipv6.ip6_prohibit_entry);
4621 kfree(net->ipv6.ip6_blk_hole_entry);
4623 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-netns init: create the two procfs entries once the rest of
 * the routing state for the namespace is ready. */
4626 static int __net_init ip6_route_net_init_late(struct net *net)
4628 #ifdef CONFIG_PROC_FS
4629 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4630 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
/* Late per-netns teardown: remove the procfs entries created in
 * ip6_route_net_init_late(). */
4635 static void __net_exit ip6_route_net_exit_late(struct net *net)
4637 #ifdef CONFIG_PROC_FS
4638 remove_proc_entry("ipv6_route", net->proc_net);
4639 remove_proc_entry("rt6_stats", net->proc_net);
/* Main per-netns lifecycle hooks for IPv6 routing state. */
4643 static struct pernet_operations ip6_route_net_ops = {
4644 .init = ip6_route_net_init,
4645 .exit = ip6_route_net_exit,
/* Allocate and initialise the per-netns IPv6 inetpeer base. */
4648 static int __net_init ipv6_inetpeer_init(struct net *net)
4650 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4654 inet_peer_base_init(bp);
4655 net->ipv6.peers = bp;
/* Tear down the per-netns inetpeer base: detach it first, then
 * invalidate the peer tree (which releases its entries). */
4659 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4661 struct inet_peer_base *bp = net->ipv6.peers;
4663 net->ipv6.peers = NULL;
4664 inetpeer_invalidate_tree(bp);
/* Per-netns lifecycle hooks for the IPv6 inetpeer base. */
4668 static struct pernet_operations ipv6_inetpeer_ops = {
4669 .init = ipv6_inetpeer_init,
4670 .exit = ipv6_inetpeer_exit,
/* Late per-netns hooks (procfs entries); registered after the main
 * route ops in ip6_route_init(). */
4673 static struct pernet_operations ip6_route_net_late_ops = {
4674 .init = ip6_route_net_init_late,
4675 .exit = ip6_route_net_exit_late,
/* Netdevice notifier block; priority is set relative to addrconf's
 * notifier so ordering between the two is explicit. */
4678 static struct notifier_block ip6_route_dev_notifier = {
4679 .notifier_call = ip6_route_dev_notify,
4680 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
/* Boot-time fixup for init_net: the loopback device registered before
 * ip6_route_dev_notify() could run, so wire the special route entries
 * to it by hand (new namespaces get this via the notifier instead). */
4683 void __init ip6_route_init_special_entries(void)
4685 /* Registering of the loopback is done before this portion of code,
4686 * the loopback reference in rt6_info will not be taken, do it
4687 * manually for init_net */
4688 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4689 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4690 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4691 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4692 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4693 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4694 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
/* Module init for IPv6 routing: create the rt6_info slab cache,
 * register pernet subsystems, rtnetlink handlers and the netdevice
 * notifier, and initialise the per-cpu uncached route lists.
 * Failures unwind in reverse registration order via goto labels.
 * NOTE(review): some labels/returns are missing from this extract;
 * code kept byte-identical. */
4698 int __init ip6_route_init(void)
/* Slab cache backing every rt6_info allocation. */
4704 ip6_dst_ops_template.kmem_cachep =
4705 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4706 SLAB_HWCACHE_ALIGN, NULL);
4707 if (!ip6_dst_ops_template.kmem_cachep)
4710 ret = dst_entries_init(&ip6_dst_blackhole_ops);
4712 goto out_kmem_cache;
4714 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4716 goto out_dst_entries;
4718 ret = register_pernet_subsys(&ip6_route_net_ops);
4720 goto out_register_inetpeer;
/* Blackhole dsts share the same slab cache as regular routes. */
4722 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4726 goto out_register_subsys;
4732 ret = fib6_rules_init();
4736 ret = register_pernet_subsys(&ip6_route_net_late_ops);
4738 goto fib6_rules_init;
/* rtnetlink doit handlers; GETROUTE runs without the rtnl lock. */
4741 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4742 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4743 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4744 RTNL_FLAG_DOIT_UNLOCKED))
4745 goto out_register_late_subsys;
4747 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4749 goto out_register_late_subsys;
/* Per-cpu lists of uncached routes, each with its own lock. */
4751 for_each_possible_cpu(cpu) {
4752 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4754 INIT_LIST_HEAD(&ul->head);
4755 spin_lock_init(&ul->lock);
/* Error unwinding, reverse order of the registrations above. */
4761 out_register_late_subsys:
4762 unregister_pernet_subsys(&ip6_route_net_late_ops);
4764 fib6_rules_cleanup();
4769 out_register_subsys:
4770 unregister_pernet_subsys(&ip6_route_net_ops);
4771 out_register_inetpeer:
4772 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4774 dst_entries_destroy(&ip6_dst_blackhole_ops);
4776 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/* Module teardown: undo ip6_route_init() in reverse order. */
4780 void ip6_route_cleanup(void)
4782 unregister_netdevice_notifier(&ip6_route_dev_notifier);
4783 unregister_pernet_subsys(&ip6_route_net_late_ops);
4784 fib6_rules_cleanup();
4787 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4788 unregister_pernet_subsys(&ip6_route_net_ops);
4789 dst_entries_destroy(&ip6_dst_blackhole_ops);
4790 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);